diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp index 1f86f058d..0794c9a01 160000 --- a/3rdparty/llama.cpp +++ b/3rdparty/llama.cpp @@ -1 +1 @@ -Subproject commit 1f86f058de0c3f4098dedae2ae8653c335c868a1 +Subproject commit 0794c9a01b5b1e026c6e6f2de59f1206efc3222e diff --git a/data/log.txt b/data/log.txt new file mode 100644 index 000000000..789edf80c --- /dev/null +++ b/data/log.txt @@ -0,0 +1,3453 @@ +[2026-05-18 13:21:00,773 INFO MainThread eval_mmteb_v2.py:426] Will evaluate 10 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SprintDuplicateQuestions', 'T2Reranking', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-18 13:21:00,773 INFO MainThread eval_mmteb_v2.py:434] +============================================================ +[2026-05-18 13:21:00,773 INFO MainThread eval_mmteb_v2.py:435] Evaluating model type: safetensors +[2026-05-18 13:21:00,773 INFO MainThread eval_mmteb_v2.py:436] ============================================================ +[2026-05-18 13:21:00,882 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 13:21:00,955 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 13:21:03,560 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 13:21:04,503 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=8 +[2026-05-18 13:21:04,503 INFO MainThread eval_mmteb_v2.py:294] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-18 13:21:04,503 INFO MainThread eval_mmteb_v2.py:298] [1/10] Evaluating: BornholmBitextMining +[2026-05-18 
13:21:07,714 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-18 13:21:15,463 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 13:21:15,472 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-18 13:21:15,484 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 13:21:15,742 INFO MainThread eval_mmteb_v2.py:298] [2/10] Evaluating: FinancialPhrasebankClassification +[2026-05-18 13:21:15,744 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 13:21:18,696 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 13:21:18,697 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 13:21:19,318 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:21:24,964 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 13:21:24,964 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 13:21:24,965 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 13:21:25,592 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 13:23:42,407 INFO MainThread eval_mmteb_v2.py:436] Will evaluate 10 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SprintDuplicateQuestions', 'T2Reranking', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-18 13:23:42,407 INFO MainThread eval_mmteb_v2.py:444] +============================================================ +[2026-05-18 13:23:42,407 INFO MainThread eval_mmteb_v2.py:445] Evaluating model type: safetensors +[2026-05-18 13:23:42,407 INFO MainThread eval_mmteb_v2.py:446] ============================================================ +[2026-05-18 13:23:42,522 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 13:23:42,597 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 13:23:45,767 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 13:23:46,969 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-18 13:23:46,969 INFO MainThread eval_mmteb_v2.py:302] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-18 13:23:46,970 INFO MainThread eval_mmteb_v2.py:306] [1/10] Evaluating: BornholmBitextMining +[2026-05-18 13:23:50,232 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-18 13:23:54,968 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 13:23:54,976 INFO MainThread bitext_mining.py:188] Computing metrics... 
+[2026-05-18 13:23:54,987 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 13:23:55,017 INFO MainThread eval_mmteb_v2.py:306] [2/10] Evaluating: FinancialPhrasebankClassification +[2026-05-18 13:23:55,018 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 13:23:58,438 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 13:23:58,440 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 13:23:59,073 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:04,658 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:04,668 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:05,810 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 13:24:06,391 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:06,606 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:06,613 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:07,502 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 13:24:07,996 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:08,208 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:08,217 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:09,413 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 13:24:09,998 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 13:24:10,191 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:10,204 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:11,344 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 13:24:11,923 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:12,136 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:12,144 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:13,317 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 13:24:13,805 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:14,007 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:14,017 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:14,907 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 13:24:15,398 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:15,602 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:15,613 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:16,497 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 13:24:16,989 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:17,200 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:17,211 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:18,390 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 13:24:19,003 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 13:24:19,222 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:19,235 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:20,362 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 13:24:20,847 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:21,043 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:21,054 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:21,941 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-18 13:24:21,942 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-18 13:24:21,975 INFO MainThread eval_mmteb_v2.py:306] [3/10] Evaluating: PoemSentimentClassification +[2026-05-18 13:24:21,977 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 13:24:24,925 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-18 13:24:24,927 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 13:24:25,001 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:25,437 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:25,451 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:25,521 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 13:24:25,596 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:25,786 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 13:24:25,796 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:25,867 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 13:24:25,942 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:26,146 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:26,158 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:26,236 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 13:24:26,310 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:26,503 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:26,512 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:26,583 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 13:24:26,657 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:26,865 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:26,880 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:26,950 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 13:24:27,024 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:27,228 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:27,244 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:27,313 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 13:24:27,388 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:27,581 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 13:24:27,594 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:27,664 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 13:24:27,738 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:27,942 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:27,953 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:28,023 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 13:24:28,098 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:28,290 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:28,301 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:28,372 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 13:24:28,446 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:28,650 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:28,662 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:28,732 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 13:24:28,732 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... +[2026-05-18 13:24:28,734 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 13:24:28,807 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:29,322 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:29,333 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 13:24:29,402 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 13:24:29,477 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:29,682 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:29,692 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:29,762 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 13:24:29,836 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:30,047 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:30,062 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:30,131 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 13:24:30,206 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:30,404 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:30,415 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:30,521 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 13:24:30,595 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:30,789 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:30,809 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:30,879 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 13:24:30,953 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:31,161 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:31,171 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 13:24:31,240 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 13:24:31,315 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:31,763 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:31,773 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:31,845 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 13:24:31,919 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:32,117 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:32,129 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:32,199 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 13:24:32,274 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:32,472 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:32,482 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:32,588 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 13:24:32,662 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:32,869 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:32,883 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:32,953 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. 
+[2026-05-18 13:24:32,954 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-18 13:24:32,978 INFO MainThread eval_mmteb_v2.py:306] [4/10] Evaluating: KorSarcasmClassification +[2026-05-18 13:24:32,980 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 13:24:35,886 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-18 13:24:35,888 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 13:24:36,469 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:44,631 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:44,636 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:45,792 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 13:24:46,245 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:46,441 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:46,446 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:47,558 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 13:24:48,010 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:48,224 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:48,232 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:49,340 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 13:24:49,786 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 13:24:49,973 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:49,979 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:51,412 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 13:24:51,946 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:52,150 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:52,154 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:53,278 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 13:24:53,727 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:53,915 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:53,918 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:55,028 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 13:24:55,479 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:55,681 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:55,688 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:56,822 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 13:24:57,274 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:24:57,472 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:57,480 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:24:58,924 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 13:24:59,451 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 13:24:59,650 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:24:59,657 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:25:00,824 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 13:25:01,274 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 13:25:01,469 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 13:25:01,476 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 13:25:02,625 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-18 13:25:02,627 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-18 13:25:02,657 INFO MainThread eval_mmteb_v2.py:306] [5/10] Evaluating: KorHateSpeechMLClassification +[2026-05-18 13:25:20,124 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-18 13:25:20,129 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-18 13:25:29,961 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-18 13:25:32,344 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-18 13:25:32,907 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-18 13:25:41,750 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... +[2026-05-18 13:25:49,618 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. 
+[2026-05-18 13:25:49,620 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-18 13:25:49,656 INFO MainThread eval_mmteb_v2.py:306] [6/10] Evaluating: SprintDuplicateQuestions +[2026-05-18 13:25:58,440 INFO MainThread abstask.py:176] Running task SprintDuplicateQuestions (split='test', hf_subset='default')... +[2026-05-18 13:25:58,667 INFO MainThread pair_classification_evaluator.py:90] Running pair classification - Encoding samples (1/2) +[2026-05-18 13:27:28,061 INFO MainThread pair_classification_evaluator.py:104] Running pair classification - Encoding samples (2/2) +[2026-05-18 13:29:22,194 INFO MainThread pair_classification_evaluator.py:119] Running pair classification - Evaluating pair similarity... +[2026-05-18 13:29:23,809 INFO MainThread pair_classification.py:122] Computing metrics... +[2026-05-18 13:29:25,068 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SprintDuplicateQuestions +[2026-05-18 13:29:25,100 INFO MainThread eval_mmteb_v2.py:306] [7/10] Evaluating: T2Reranking +[2026-05-18 13:29:28,527 INFO MainThread retrieval_dataset_loaders.py:160] Loading qrels... +[2026-05-18 13:29:35,454 INFO MainThread retrieval_dataset_loaders.py:191] Loaded 5908 DEV qrels. +[2026-05-18 13:29:35,455 INFO MainThread retrieval_dataset_loaders.py:130] Loading Corpus... +[2026-05-18 13:29:47,253 INFO MainThread retrieval_dataset_loaders.py:138] Loaded 97422 DEV Documents. +[2026-05-18 13:29:47,253 INFO MainThread retrieval_dataset_loaders.py:139] Doc Example: {'id': 'apositive_dev_query0_00000', 'text': '

【重新获取取件码】
1、首先来到丰巢快递柜前,点击屏幕上的【取快递】;

2、然后选择取件码取件;

3、在输入取件码的右下方,有一个【忘记取件码】,点击;

4、然后输入快递使用的手机号码,点击“获取验证码”,验证码输入后,点击【下一步】;

5、可以看到当前柜机中存放的快递信息,点击右上角的【取件】,将快递取出即可。
', 'title': ''} +[2026-05-18 13:29:47,253 INFO MainThread retrieval_dataset_loaders.py:143] Loading Queries... +[2026-05-18 13:29:52,966 INFO MainThread retrieval_dataset_loaders.py:154] Loaded 5908 DEV queries. +[2026-05-18 13:29:52,966 INFO MainThread retrieval_dataset_loaders.py:155] Query Example: {'id': 'dev_query0', 'text': '蜂巢取快递验证码摁错怎么办'} +[2026-05-18 13:29:53,323 INFO MainThread retrieval_dataset_loaders.py:195] Loading Top Ranked +[2026-05-18 13:29:57,703 INFO MainThread retrieval_dataset_loaders.py:215] Top ranked loaded: 5908 +[2026-05-18 13:29:57,703 INFO MainThread abstask.py:176] Running task T2Reranking (split='dev', hf_subset='default')... +[2026-05-18 13:29:58,084 INFO MainThread retrieval_evaluator.py:53] Running retrieval task - Indexing corpus... +[2026-05-18 13:29:58,084 INFO MainThread retrieval_evaluator.py:61] Running retrieval task - Searching queries... +[2026-05-18 13:30:04,740 INFO MainThread search_wrappers.py:130] Reranking pre-ranked documents... +[2026-05-18 13:45:49,395 INFO MainThread retrieval.py:390] Running retrieval task - Evaluating retrieval scores... +[2026-05-18 13:45:49,915 INFO MainThread retrieval.py:415] Running retrieval task - Finished. +[2026-05-18 13:45:49,924 INFO MainThread evaluate.py:481] ✓ Finished evaluation for T2Reranking +[2026-05-18 13:45:49,964 INFO MainThread eval_mmteb_v2.py:306] [8/10] Evaluating: SICK-R +[2026-05-18 13:45:53,160 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-18 13:45:53,289 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:04,044 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:14,398 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:14,521 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. 
+[2026-05-18 13:46:14,546 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-18 13:46:14,576 INFO MainThread eval_mmteb_v2.py:306] [9/10] Evaluating: STSBenchmark +[2026-05-18 13:46:18,446 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-18 13:46:18,466 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:20,570 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:22,345 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:22,366 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:22,374 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-18 13:46:22,401 INFO MainThread eval_mmteb_v2.py:306] [10/10] Evaluating: STS17 +[2026-05-18 13:46:25,516 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... +[2026-05-18 13:46:25,547 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:34,320 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:42,556 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:42,594 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:42,604 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ar-ar')... +[2026-05-18 13:46:42,610 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:42,970 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... 
+[2026-05-18 13:46:43,310 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:43,316 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:43,322 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-ar')... +[2026-05-18 13:46:43,327 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:43,608 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:43,984 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:43,994 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:44,001 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-de')... +[2026-05-18 13:46:44,006 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:44,284 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:44,599 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:44,605 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:44,610 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-en')... +[2026-05-18 13:46:44,615 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:44,909 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:45,187 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:45,197 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. 
+[2026-05-18 13:46:45,208 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-tr')... +[2026-05-18 13:46:45,214 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:45,481 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:45,824 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:45,829 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:45,834 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-en')... +[2026-05-18 13:46:45,838 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:46,221 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:46,509 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:46,514 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:46,519 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-es')... +[2026-05-18 13:46:46,525 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:46,874 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:47,206 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:47,212 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:47,217 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='fr-en')... 
+[2026-05-18 13:46:47,222 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:47,547 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:47,832 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:47,839 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:47,844 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='it-en')... +[2026-05-18 13:46:47,850 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:48,176 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:48,448 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:48,453 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 13:46:48,458 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='nl-en')... +[2026-05-18 13:46:48,464 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 13:46:48,841 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 13:46:49,119 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 13:46:49,123 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. 
+[2026-05-18 13:46:49,128 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STS17 +[2026-05-18 13:46:49,171 INFO MainThread eval_mmteb_v2.py:444] +============================================================ +[2026-05-18 13:46:49,172 INFO MainThread eval_mmteb_v2.py:445] Evaluating model type: f16 +[2026-05-18 13:46:49,172 INFO MainThread eval_mmteb_v2.py:446] ============================================================ +[2026-05-18 13:46:49,172 INFO MainThread eval_mmteb_v2.py:302] Loaded model: microsoft/bitnet-embeddings-0.6b-f16-gguf +[2026-05-18 13:46:49,172 INFO MainThread eval_mmteb_v2.py:306] [1/10] Evaluating: BornholmBitextMining +[2026-05-18 13:46:49,172 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-18 13:49:04,292 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 13:49:04,294 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-18 13:49:04,303 INFO MainThread abstask.py:608] Unloaded dataset BornholmBitextMining from memory. +[2026-05-18 13:49:04,303 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 13:49:04,304 INFO MainThread eval_mmteb_v2.py:306] [2/10] Evaluating: FinancialPhrasebankClassification +[2026-05-18 13:49:04,304 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 13:49:04,304 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 13:49:04,305 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 13:49:04,790 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 14:05:04,065 INFO MainThread eval_mmteb_v2.py:438] Will evaluate 9 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SprintDuplicateQuestions', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-18 14:05:04,065 INFO MainThread eval_mmteb_v2.py:446] +============================================================ +[2026-05-18 14:05:04,065 INFO MainThread eval_mmteb_v2.py:447] Evaluating model type: safetensors +[2026-05-18 14:05:04,065 INFO MainThread eval_mmteb_v2.py:448] ============================================================ +[2026-05-18 14:05:04,180 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 14:05:04,270 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 14:05:06,952 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 14:05:07,928 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-18 14:05:07,928 INFO MainThread eval_mmteb_v2.py:302] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-18 14:05:07,928 INFO MainThread eval_mmteb_v2.py:306] [1/9] Evaluating: BornholmBitextMining +[2026-05-18 14:05:11,152 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-18 14:05:15,846 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 14:05:15,860 INFO MainThread bitext_mining.py:188] Computing metrics... 
+[2026-05-18 14:05:15,871 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 14:05:15,901 INFO MainThread eval_mmteb_v2.py:306] [2/9] Evaluating: FinancialPhrasebankClassification +[2026-05-18 14:05:15,903 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 14:05:18,788 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 14:05:18,789 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 14:05:19,428 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:24,989 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:24,998 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:26,175 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 14:05:26,675 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:26,872 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:26,880 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:27,956 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 14:05:28,438 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:28,660 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:28,670 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:29,842 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 14:05:30,465 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 14:05:30,668 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:30,681 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:31,742 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 14:05:32,219 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:32,420 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:32,437 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:33,619 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 14:05:34,180 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:34,384 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:34,391 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:35,521 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 14:05:36,099 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:36,298 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:36,313 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:37,435 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 14:05:37,963 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:38,157 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:38,167 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:39,340 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 14:05:39,908 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 14:05:40,123 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:40,136 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:41,312 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 14:05:41,901 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:42,109 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:42,119 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:43,043 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-18 14:05:43,044 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-18 14:05:43,075 INFO MainThread eval_mmteb_v2.py:306] [3/9] Evaluating: PoemSentimentClassification +[2026-05-18 14:05:43,077 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 14:05:46,107 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-18 14:05:46,109 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 14:05:46,184 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:46,631 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:46,645 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:46,715 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 14:05:46,789 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:46,982 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 14:05:46,994 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:47,063 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 14:05:47,136 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:47,328 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:47,342 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:47,411 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 14:05:47,484 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:47,675 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:47,688 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:47,758 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 14:05:47,831 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:48,039 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:48,052 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:48,121 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 14:05:48,194 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:48,434 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:48,449 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:48,555 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 14:05:48,629 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:48,836 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 14:05:48,848 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:48,918 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 14:05:48,992 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:49,192 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:49,205 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:49,275 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 14:05:49,348 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:49,559 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:49,572 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:49,642 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 14:05:49,716 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:49,929 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:49,945 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:50,015 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 14:05:50,016 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... +[2026-05-18 14:05:50,018 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 14:05:50,092 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:50,508 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:50,518 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 14:05:50,589 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 14:05:50,662 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:50,857 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:50,870 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:50,939 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 14:05:51,012 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:51,202 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:51,215 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:51,294 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 14:05:51,368 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:51,563 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:51,577 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:51,683 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 14:05:51,757 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:51,956 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:51,967 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:52,073 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 14:05:52,149 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:52,609 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:52,623 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 14:05:52,697 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 14:05:52,773 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:52,976 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:52,991 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:53,061 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 14:05:53,135 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:53,335 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:53,349 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:53,455 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 14:05:53,533 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:53,750 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:53,762 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:53,832 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 14:05:53,907 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:05:54,118 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:05:54,130 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:05:54,237 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. 
+[2026-05-18 14:05:54,239 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-18 14:05:54,262 INFO MainThread eval_mmteb_v2.py:306] [4/9] Evaluating: KorSarcasmClassification +[2026-05-18 14:05:54,264 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 14:05:57,425 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-18 14:05:57,427 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 14:05:58,007 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:06:03,732 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:03,739 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:05,210 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 14:06:05,787 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:06:05,982 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:05,999 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:07,440 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 14:06:07,954 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:06:08,150 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:08,155 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:09,264 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 14:06:09,711 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 14:06:09,920 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:09,923 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:11,048 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 14:06:11,496 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:06:11,688 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:11,696 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:12,795 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 14:06:13,239 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:06:13,448 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:13,455 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:14,566 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 14:06:15,009 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:06:15,219 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:15,229 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:16,697 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 14:06:17,140 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:06:17,346 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:17,351 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:18,809 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 14:06:19,248 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 14:06:19,450 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:19,457 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:20,941 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 14:06:21,527 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:06:21,724 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:06:21,728 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:06:23,003 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-18 14:06:23,005 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-18 14:06:23,035 INFO MainThread eval_mmteb_v2.py:306] [5/9] Evaluating: KorHateSpeechMLClassification +[2026-05-18 14:06:26,030 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-18 14:06:26,035 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-18 14:06:35,934 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-18 14:06:38,322 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-18 14:06:38,755 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-18 14:06:47,554 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... +[2026-05-18 14:06:56,007 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. 
+[2026-05-18 14:06:56,009 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-18 14:06:56,051 INFO MainThread eval_mmteb_v2.py:306] [6/9] Evaluating: SprintDuplicateQuestions +[2026-05-18 14:07:00,281 INFO MainThread abstask.py:176] Running task SprintDuplicateQuestions (split='test', hf_subset='default')... +[2026-05-18 14:07:00,493 INFO MainThread pair_classification_evaluator.py:90] Running pair classification - Encoding samples (1/2) +[2026-05-18 14:08:29,131 INFO MainThread pair_classification_evaluator.py:104] Running pair classification - Encoding samples (2/2) +[2026-05-18 14:10:17,677 INFO MainThread pair_classification_evaluator.py:119] Running pair classification - Evaluating pair similarity... +[2026-05-18 14:10:20,487 INFO MainThread pair_classification.py:122] Computing metrics... +[2026-05-18 14:10:21,474 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SprintDuplicateQuestions +[2026-05-18 14:10:21,500 INFO MainThread eval_mmteb_v2.py:306] [7/9] Evaluating: SICK-R +[2026-05-18 14:10:24,452 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-18 14:10:24,579 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:10:34,952 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:10:45,135 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:10:45,339 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:10:45,413 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-18 14:10:45,453 INFO MainThread eval_mmteb_v2.py:306] [8/9] Evaluating: STSBenchmark +[2026-05-18 14:10:49,319 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... 
+[2026-05-18 14:10:49,339 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:10:51,483 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:10:53,254 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:10:53,268 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:10:53,284 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-18 14:10:53,313 INFO MainThread eval_mmteb_v2.py:306] [9/9] Evaluating: STS17 +[2026-05-18 14:10:56,337 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... +[2026-05-18 14:10:56,376 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:05,230 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:13,408 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:13,447 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:13,469 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ar-ar')... +[2026-05-18 14:11:13,474 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:13,832 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:14,172 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:14,179 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:14,200 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-ar')... 
+[2026-05-18 14:11:14,205 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:14,482 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:14,854 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:14,859 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:14,882 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-de')... +[2026-05-18 14:11:14,887 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:15,155 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:15,453 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:15,457 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:15,483 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-en')... +[2026-05-18 14:11:15,488 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:15,759 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:16,026 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:16,032 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:16,059 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-tr')... 
+[2026-05-18 14:11:16,064 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:16,334 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:16,684 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:16,689 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:16,720 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-en')... +[2026-05-18 14:11:16,725 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:17,100 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:17,368 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:17,372 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:17,403 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-es')... +[2026-05-18 14:11:17,408 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:17,767 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:18,099 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:18,104 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:18,138 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='fr-en')... 
+[2026-05-18 14:11:18,143 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:18,464 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:18,730 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:18,736 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:18,772 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='it-en')... +[2026-05-18 14:11:18,777 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:19,100 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:19,372 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:19,376 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 14:11:19,410 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='nl-en')... +[2026-05-18 14:11:19,415 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 14:11:19,798 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 14:11:20,068 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 14:11:20,073 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. 
+[2026-05-18 14:11:20,111 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STS17 +[2026-05-18 14:11:20,161 INFO MainThread eval_mmteb_v2.py:446] +============================================================ +[2026-05-18 14:11:20,161 INFO MainThread eval_mmteb_v2.py:447] Evaluating model type: f16 +[2026-05-18 14:11:20,161 INFO MainThread eval_mmteb_v2.py:448] ============================================================ +[2026-05-18 14:11:20,162 INFO MainThread eval_mmteb_v2.py:302] Loaded model: microsoft/bitnet-embeddings-0.6b-f16-gguf +[2026-05-18 14:11:20,162 INFO MainThread eval_mmteb_v2.py:306] [1/9] Evaluating: BornholmBitextMining +[2026-05-18 14:11:20,162 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-18 14:13:43,590 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 14:13:43,596 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-18 14:13:43,606 INFO MainThread abstask.py:608] Unloaded dataset BornholmBitextMining from memory. +[2026-05-18 14:13:43,606 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 14:13:43,607 INFO MainThread eval_mmteb_v2.py:306] [2/9] Evaluating: FinancialPhrasebankClassification +[2026-05-18 14:13:43,607 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 14:13:43,608 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 14:13:43,609 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 14:13:44,229 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:23:57,286 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 14:23:57,294 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:23:58,211 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 14:23:58,691 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:24:05,428 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:24:05,436 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:24:06,592 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 14:24:07,214 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:24:14,906 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:24:14,914 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:24:16,074 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 14:24:16,684 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:24:22,959 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:24:22,965 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:24:23,880 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 14:24:24,364 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:24:32,023 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:24:32,031 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:24:32,967 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 14:24:33,464 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:24:39,523 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 14:24:39,532 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:24:40,700 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 14:24:41,331 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:24:47,463 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:24:47,472 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:24:48,629 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 14:24:49,208 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:24:55,392 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:24:55,399 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:24:56,583 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 14:24:57,126 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:25:04,551 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:25:04,559 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:25:05,454 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 14:25:05,935 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:25:12,619 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:25:12,625 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:25:13,508 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-18 14:25:13,510 INFO MainThread abstask.py:608] Unloaded dataset FinancialPhrasebankClassification from memory. 
+[2026-05-18 14:25:13,510 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-18 14:25:13,511 INFO MainThread eval_mmteb_v2.py:306] [3/9] Evaluating: PoemSentimentClassification +[2026-05-18 14:25:13,511 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 14:25:13,511 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-18 14:25:13,513 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 14:25:13,569 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:25:39,830 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:25:39,838 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:25:39,895 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 14:25:39,954 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:25:46,049 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:25:46,058 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:25:46,136 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 14:25:46,210 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:25:52,219 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:25:52,227 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:25:52,284 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 14:25:52,342 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 14:25:58,164 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:25:58,174 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:25:58,245 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 14:25:58,319 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:26:04,062 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:26:04,070 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:26:04,141 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 14:26:04,214 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:26:09,542 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:26:09,551 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:26:09,621 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 14:26:09,695 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:26:15,179 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:26:15,188 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:26:15,245 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 14:26:15,303 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:26:20,910 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:26:20,919 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:26:20,989 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 14:26:21,064 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 14:26:25,807 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:26:25,817 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:26:25,887 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 14:26:25,961 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:26:31,591 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:26:31,602 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:26:31,673 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 14:26:31,674 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... +[2026-05-18 14:26:31,676 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 14:26:31,750 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:26:56,645 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:26:56,655 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:26:56,725 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 14:26:56,800 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:27:02,528 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:27:02,537 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:27:02,607 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 14:27:02,680 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:27:08,246 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 14:27:08,255 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:27:08,326 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 14:27:08,399 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:27:14,321 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:27:14,330 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:27:14,401 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 14:27:14,474 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:27:20,253 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:27:20,261 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:27:20,331 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 14:27:20,405 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:27:25,964 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:27:25,971 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:27:26,028 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 14:27:26,085 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:27:32,133 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:27:32,144 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:27:32,200 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 14:27:32,259 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:27:38,505 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 14:27:38,514 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:27:38,584 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 14:27:38,658 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:27:44,489 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:27:44,497 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:27:44,553 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 14:27:44,611 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:27:50,071 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:27:50,080 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:27:50,138 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 14:27:50,140 INFO MainThread abstask.py:608] Unloaded dataset PoemSentimentClassification from memory. +[2026-05-18 14:27:50,140 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-18 14:27:50,141 INFO MainThread eval_mmteb_v2.py:306] [4/9] Evaluating: KorSarcasmClassification +[2026-05-18 14:27:50,141 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 14:27:50,141 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-18 14:27:50,142 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 14:27:50,583 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:09,510 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 14:37:09,515 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:11,001 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 14:37:11,582 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:14,974 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:37:14,979 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:16,390 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 14:37:16,831 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:19,934 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:37:19,938 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:21,039 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 14:37:21,488 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:25,250 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:37:25,254 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:26,403 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 14:37:26,847 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:30,596 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:37:30,604 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:31,763 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 14:37:32,202 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:35,530 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 14:37:35,535 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:36,686 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 14:37:37,127 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:40,494 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:37:40,500 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:41,651 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 14:37:42,093 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:46,130 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:37:46,135 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:47,293 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 14:37:47,735 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:52,276 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:37:52,281 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:53,408 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 14:37:53,848 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 14:37:57,909 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 14:37:57,914 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 14:37:59,095 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-18 14:37:59,097 INFO MainThread abstask.py:608] Unloaded dataset KorSarcasmClassification from memory. 
+[2026-05-18 14:37:59,097 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-18 14:37:59,098 INFO MainThread eval_mmteb_v2.py:306] [5/9] Evaluating: KorHateSpeechMLClassification +[2026-05-18 14:37:59,098 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-18 14:37:59,102 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-18 14:38:07,940 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-18 14:40:24,470 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-18 14:40:25,034 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-18 14:49:09,950 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... +[2026-05-18 14:49:16,442 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. +[2026-05-18 14:49:16,445 INFO MainThread abstask.py:608] Unloaded dataset KorHateSpeechMLClassification from memory. +[2026-05-18 14:49:16,446 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-18 14:49:16,446 INFO MainThread eval_mmteb_v2.py:306] [6/9] Evaluating: SprintDuplicateQuestions +[2026-05-18 14:49:16,447 INFO MainThread abstask.py:176] Running task SprintDuplicateQuestions (split='test', hf_subset='default')... 
+[2026-05-18 14:49:16,660 INFO MainThread pair_classification_evaluator.py:90] Running pair classification - Encoding samples (1/2) +[2026-05-18 18:14:27,963 INFO MainThread eval_mmteb_v2.py:438] Will evaluate 8 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-18 18:14:27,964 INFO MainThread eval_mmteb_v2.py:446] +============================================================ +[2026-05-18 18:14:27,964 INFO MainThread eval_mmteb_v2.py:447] Evaluating model type: safetensors +[2026-05-18 18:14:27,964 INFO MainThread eval_mmteb_v2.py:448] ============================================================ +[2026-05-18 18:14:28,072 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 18:14:28,145 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 18:14:31,381 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 18:14:32,372 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-18 18:14:32,372 INFO MainThread eval_mmteb_v2.py:302] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-18 18:14:32,372 INFO MainThread eval_mmteb_v2.py:306] [1/8] Evaluating: BornholmBitextMining +[2026-05-18 18:14:35,530 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-18 18:14:40,230 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 18:14:40,244 INFO MainThread bitext_mining.py:188] Computing metrics... 
+[2026-05-18 18:14:40,256 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 18:14:40,287 INFO MainThread eval_mmteb_v2.py:306] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-18 18:14:40,290 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 18:14:43,262 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 18:14:43,263 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 18:14:43,898 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:14:49,546 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:14:49,555 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:14:50,689 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 18:14:51,314 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:14:51,525 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:14:51,536 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:14:52,669 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 18:14:53,303 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:14:53,521 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:14:53,534 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:14:54,733 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 18:14:55,365 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 18:14:55,573 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:14:55,587 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:14:56,843 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 18:14:57,468 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:14:57,710 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:14:57,722 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:14:58,900 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 18:14:59,545 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:14:59,769 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:14:59,792 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:00,934 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 18:15:01,569 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:01,790 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:01,802 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:02,969 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 18:15:03,621 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:03,847 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:03,859 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:04,964 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 18:15:05,455 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 18:15:05,667 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:05,677 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:06,600 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 18:15:07,084 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:07,295 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:07,310 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:08,488 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-18 18:15:08,491 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-18 18:15:08,529 INFO MainThread eval_mmteb_v2.py:306] [3/8] Evaluating: PoemSentimentClassification +[2026-05-18 18:15:08,535 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 18:15:11,560 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-18 18:15:11,563 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 18:15:11,639 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:12,095 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:12,106 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:12,176 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 18:15:12,250 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:12,470 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 18:15:12,484 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:12,555 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 18:15:12,630 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:12,841 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:12,852 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:12,923 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 18:15:12,997 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:13,230 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:13,243 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:13,314 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 18:15:13,389 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:13,601 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:13,615 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:13,685 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 18:15:13,761 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:13,961 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:13,973 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:14,043 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 18:15:14,118 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:14,352 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 18:15:14,367 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:14,437 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 18:15:14,512 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:14,742 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:14,758 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:14,828 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 18:15:14,903 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:15,122 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:15,135 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:15,206 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 18:15:15,282 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:15,490 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:15,503 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:15,574 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 18:15:15,574 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... +[2026-05-18 18:15:15,576 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 18:15:15,650 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:16,064 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:16,074 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:15:16,143 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 18:15:16,217 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:16,414 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:16,426 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:16,494 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 18:15:16,567 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:17,171 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:17,185 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:17,255 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 18:15:17,329 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:17,537 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:17,552 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:17,657 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 18:15:17,730 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:17,940 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:17,950 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:18,028 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 18:15:18,101 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:18,306 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:18,322 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:15:18,392 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 18:15:18,466 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:18,670 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:18,682 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:18,752 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 18:15:18,826 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:19,036 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:19,047 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:19,117 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 18:15:19,195 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:19,398 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:19,412 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:19,482 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 18:15:19,561 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:19,770 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:19,780 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:19,851 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. 
+[2026-05-18 18:15:19,852 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-18 18:15:19,876 INFO MainThread eval_mmteb_v2.py:306] [4/8] Evaluating: KorSarcasmClassification +[2026-05-18 18:15:19,877 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 18:15:22,774 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-18 18:15:22,776 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 18:15:23,280 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:28,991 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:28,996 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:30,279 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 18:15:30,734 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:30,949 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:30,954 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:32,069 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 18:15:32,513 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:32,723 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:32,729 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:34,151 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 18:15:34,723 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 18:15:34,932 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:34,940 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:36,039 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 18:15:36,484 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:36,691 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:36,699 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:38,113 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 18:15:38,687 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:38,881 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:38,890 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:40,003 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 18:15:40,450 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:40,667 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:40,678 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:42,129 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 18:15:42,702 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:42,916 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:42,922 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:44,387 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 18:15:44,938 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 18:15:45,192 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:45,202 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:46,661 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 18:15:47,193 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:15:47,384 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:15:47,391 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:15:48,505 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-18 18:15:48,507 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-18 18:15:48,538 INFO MainThread eval_mmteb_v2.py:306] [5/8] Evaluating: KorHateSpeechMLClassification +[2026-05-18 18:15:51,691 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-18 18:15:51,695 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-18 18:16:02,014 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-18 18:16:04,425 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-18 18:16:05,001 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-18 18:16:13,853 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... +[2026-05-18 18:16:20,667 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. 
+[2026-05-18 18:16:20,669 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-18 18:16:20,704 INFO MainThread eval_mmteb_v2.py:306] [6/8] Evaluating: SICK-R +[2026-05-18 18:16:23,573 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-18 18:16:23,684 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:16:33,832 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:16:44,248 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:16:44,481 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:16:44,561 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-18 18:16:44,593 INFO MainThread eval_mmteb_v2.py:306] [7/8] Evaluating: STSBenchmark +[2026-05-18 18:16:48,461 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-18 18:16:48,481 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:16:50,327 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:16:52,096 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:16:52,115 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:16:52,131 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-18 18:16:52,169 INFO MainThread eval_mmteb_v2.py:306] [8/8] Evaluating: STS17 +[2026-05-18 18:16:54,155 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... 
+[2026-05-18 18:16:54,193 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:03,063 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:11,626 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:11,676 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:11,705 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ar-ar')... +[2026-05-18 18:17:11,710 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:12,069 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:12,409 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:12,414 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:12,437 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-ar')... +[2026-05-18 18:17:12,442 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:12,720 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:13,093 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:13,100 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:13,125 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-de')... 
+[2026-05-18 18:17:13,130 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:13,404 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:13,725 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:13,730 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:13,765 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-en')... +[2026-05-18 18:17:13,775 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:14,070 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:14,345 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:14,351 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:14,379 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-tr')... +[2026-05-18 18:17:14,384 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:14,654 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:14,993 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:15,000 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:15,029 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-en')... 
+[2026-05-18 18:17:15,034 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:15,418 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:15,688 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:15,693 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:15,723 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-es')... +[2026-05-18 18:17:15,727 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:16,070 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:16,401 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:16,406 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:16,440 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='fr-en')... +[2026-05-18 18:17:16,444 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:16,755 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:17,016 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:17,024 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:17,058 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='it-en')... 
+[2026-05-18 18:17:17,063 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:17,394 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:17,667 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:17,672 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:17,707 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='nl-en')... +[2026-05-18 18:17:17,712 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 18:17:18,084 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 18:17:18,359 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 18:17:18,364 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 18:17:18,402 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STS17 +[2026-05-18 18:17:18,445 INFO MainThread eval_mmteb_v2.py:446] +============================================================ +[2026-05-18 18:17:18,445 INFO MainThread eval_mmteb_v2.py:447] Evaluating model type: f16 +[2026-05-18 18:17:18,445 INFO MainThread eval_mmteb_v2.py:448] ============================================================ +[2026-05-18 18:17:18,445 INFO MainThread eval_mmteb_v2.py:302] Loaded model: microsoft/bitnet-embeddings-0.6b-f16-gguf +[2026-05-18 18:17:18,446 INFO MainThread eval_mmteb_v2.py:306] [1/8] Evaluating: BornholmBitextMining +[2026-05-18 18:17:18,446 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-18 18:19:35,026 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... 
+[2026-05-18 18:19:35,032 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-18 18:19:35,043 INFO MainThread abstask.py:608] Unloaded dataset BornholmBitextMining from memory. +[2026-05-18 18:19:35,043 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 18:19:35,044 INFO MainThread eval_mmteb_v2.py:306] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-18 18:19:35,044 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 18:19:35,045 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 18:19:35,046 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 18:19:35,680 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:29:44,846 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:29:44,856 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:29:46,030 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 18:29:46,653 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:29:53,043 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:29:53,051 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:29:54,168 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 18:29:54,792 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:30:02,454 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:30:02,461 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:30:03,321 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 18:30:03,798 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:30:10,203 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:30:10,211 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:30:11,329 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 18:30:11,817 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:30:18,604 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:30:18,613 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:30:19,744 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 18:30:20,346 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:30:26,173 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:30:26,181 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:30:27,301 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 18:30:27,821 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:30:33,407 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:30:33,414 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:30:34,317 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 18:30:34,792 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:30:40,591 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:30:40,597 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:30:41,502 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 18:30:41,980 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:30:48,947 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:30:48,955 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:30:50,114 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 18:30:50,649 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:30:56,722 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:30:56,730 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:30:57,906 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-18 18:30:57,908 INFO MainThread abstask.py:608] Unloaded dataset FinancialPhrasebankClassification from memory. +[2026-05-18 18:30:57,908 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-18 18:30:57,909 INFO MainThread eval_mmteb_v2.py:306] [3/8] Evaluating: PoemSentimentClassification +[2026-05-18 18:30:57,909 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 18:30:57,910 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-18 18:30:57,912 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 18:30:57,987 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:31:22,882 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:31:22,892 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:31:22,963 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 18:31:23,037 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:31:29,034 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:31:29,044 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:31:29,114 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 18:31:29,189 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:31:34,536 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:31:34,545 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:31:34,615 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 18:31:34,690 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:31:40,020 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:31:40,030 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:31:40,101 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 18:31:40,176 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:31:45,590 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:31:45,599 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:31:45,670 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 18:31:45,744 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:31:51,628 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:31:51,637 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:31:51,717 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 18:31:51,792 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:31:57,472 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:31:57,482 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:31:57,554 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 18:31:57,628 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:32:03,378 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:32:03,388 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:32:03,458 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 18:32:03,533 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:32:08,886 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:32:08,906 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:32:08,977 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 18:32:09,051 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:32:14,916 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:32:14,927 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:32:14,998 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 18:32:14,999 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... 
+[2026-05-18 18:32:15,001 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 18:32:15,076 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:32:39,970 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:32:39,982 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:32:40,052 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 18:32:40,127 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:32:45,453 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:32:45,461 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:32:45,531 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 18:32:45,606 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:32:50,724 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:32:50,731 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:32:50,788 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 18:32:50,846 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:32:55,989 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:32:55,998 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:32:56,055 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 18:32:56,113 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:33:01,398 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:33:01,407 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:33:01,477 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 18:33:01,551 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:33:07,009 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:33:07,018 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:33:07,090 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 18:33:07,164 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:33:12,604 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:33:12,614 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:33:12,684 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 18:33:12,757 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:33:18,208 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:33:18,217 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:33:18,288 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 18:33:18,363 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:33:23,904 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:33:23,913 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:33:23,982 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 18:33:24,058 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:33:29,576 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:33:29,586 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:33:29,657 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 18:33:29,658 INFO MainThread abstask.py:608] Unloaded dataset PoemSentimentClassification from memory. +[2026-05-18 18:33:29,659 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-18 18:33:29,660 INFO MainThread eval_mmteb_v2.py:306] [4/8] Evaluating: KorSarcasmClassification +[2026-05-18 18:33:29,660 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 18:33:29,660 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-18 18:33:29,662 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 18:33:30,241 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:42:35,783 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:42:35,788 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:42:37,211 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 18:42:37,784 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:42:41,188 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:42:41,195 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:42:42,337 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 18:42:42,775 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:42:45,877 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:42:45,883 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:42:47,032 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 18:42:47,474 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:42:50,984 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:42:50,991 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:42:52,247 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 18:42:52,684 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:42:56,319 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:42:56,326 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:42:57,470 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 18:42:57,910 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:43:00,969 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:43:00,977 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:43:02,126 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 18:43:02,568 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:43:05,801 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:43:05,806 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:43:06,954 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 18:43:07,395 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:43:11,372 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:43:11,377 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 18:43:12,499 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 18:43:12,940 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:43:18,197 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:43:18,203 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:43:19,649 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 18:43:20,224 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 18:43:23,753 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 18:43:23,758 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 18:43:25,152 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-18 18:43:25,154 INFO MainThread abstask.py:608] Unloaded dataset KorSarcasmClassification from memory. +[2026-05-18 18:43:25,154 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-18 18:43:25,155 INFO MainThread eval_mmteb_v2.py:306] [5/8] Evaluating: KorHateSpeechMLClassification +[2026-05-18 18:43:25,155 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-18 18:43:25,159 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-18 18:43:34,234 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-18 18:45:44,822 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-18 18:45:45,388 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... 
+[2026-05-18 18:54:06,472 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... +[2026-05-18 18:54:11,807 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. +[2026-05-18 18:54:11,810 INFO MainThread abstask.py:608] Unloaded dataset KorHateSpeechMLClassification from memory. +[2026-05-18 18:54:11,810 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-18 18:54:11,811 INFO MainThread eval_mmteb_v2.py:306] [6/8] Evaluating: SICK-R +[2026-05-18 18:54:11,811 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-18 18:54:11,909 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:11:15,677 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:28:32,046 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:28:32,259 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:28:32,335 INFO MainThread abstask.py:608] Unloaded dataset SICK-R from memory. +[2026-05-18 19:28:32,336 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-18 19:28:32,337 INFO MainThread eval_mmteb_v2.py:306] [7/8] Evaluating: STSBenchmark +[2026-05-18 19:28:32,338 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-18 19:28:32,358 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:31:19,399 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:34:01,221 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... 
+[2026-05-18 19:34:01,240 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:34:01,256 INFO MainThread abstask.py:608] Unloaded dataset STSBenchmark from memory. +[2026-05-18 19:34:01,257 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-18 19:34:01,258 INFO MainThread eval_mmteb_v2.py:306] [8/8] Evaluating: STS17 +[2026-05-18 19:34:01,258 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... +[2026-05-18 19:34:01,296 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:42:45,522 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:51:33,922 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:51:33,968 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:51:33,995 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ar-ar')... +[2026-05-18 19:51:34,001 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:52:06,421 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:52:40,290 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:52:40,294 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:52:40,319 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-ar')... +[2026-05-18 19:52:40,327 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:53:05,087 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... 
+[2026-05-18 19:53:37,242 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:53:37,246 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:53:37,271 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-de')... +[2026-05-18 19:53:37,276 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:54:01,555 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:54:31,122 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:54:31,125 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:54:31,155 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-en')... +[2026-05-18 19:54:31,162 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:54:56,272 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:55:22,365 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:55:22,369 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:55:22,397 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-tr')... +[2026-05-18 19:55:22,402 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:55:48,533 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:56:22,754 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:56:22,758 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. 
+[2026-05-18 19:56:22,788 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-en')... +[2026-05-18 19:56:22,793 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:56:56,002 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:57:21,773 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:57:21,777 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:57:21,808 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-es')... +[2026-05-18 19:57:21,814 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:57:53,909 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:58:26,804 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:58:26,808 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:58:26,835 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='fr-en')... +[2026-05-18 19:58:26,840 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 19:58:59,727 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 19:59:24,830 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 19:59:24,834 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 19:59:24,868 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='it-en')... 
+[2026-05-18 19:59:24,873 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 20:00:00,219 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 20:00:25,832 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 20:00:25,837 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 20:00:25,866 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='nl-en')... +[2026-05-18 20:00:25,870 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 20:00:57,973 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 20:01:22,543 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 20:01:22,547 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 20:01:22,578 INFO MainThread abstask.py:608] Unloaded dataset STS17 from memory. +[2026-05-18 20:01:22,579 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STS17 +[2026-05-18 20:01:22,581 INFO MainThread eval_mmteb_v2.py:446] +============================================================ +[2026-05-18 20:01:22,581 INFO MainThread eval_mmteb_v2.py:447] Evaluating model type: i2s +[2026-05-18 20:01:22,581 INFO MainThread eval_mmteb_v2.py:448] ============================================================ +[2026-05-18 20:01:22,581 INFO MainThread eval_mmteb_v2.py:302] Loaded model: microsoft/bitnet-embeddings-0.6b-i2s-gguf +[2026-05-18 20:01:22,581 INFO MainThread eval_mmteb_v2.py:306] [1/8] Evaluating: BornholmBitextMining +[2026-05-18 20:01:24,821 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... 
+[2026-05-18 20:03:14,532 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 20:03:14,538 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-18 20:03:14,548 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 20:03:14,549 INFO MainThread eval_mmteb_v2.py:306] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-18 20:03:14,549 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 20:03:16,463 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 20:03:16,465 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 20:03:17,093 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:11:33,699 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:11:33,710 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:11:34,879 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 20:11:35,409 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:11:40,511 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:11:40,517 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:11:41,419 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 20:11:41,894 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:11:48,743 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:11:48,753 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 20:11:49,924 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 20:11:50,513 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:11:55,735 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:11:55,745 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:11:56,921 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 20:11:57,546 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:12:03,079 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:12:03,086 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:12:03,999 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 20:12:04,476 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:12:08,835 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:12:08,841 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:12:09,752 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 20:12:10,230 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:12:14,910 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:12:14,918 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:12:16,088 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 20:12:16,712 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:12:21,211 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:12:21,220 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 20:12:22,387 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 20:12:22,946 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:12:28,210 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:12:28,217 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:12:29,124 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 20:12:29,600 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:12:34,194 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:12:34,204 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:12:35,105 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-18 20:12:35,107 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-18 20:12:35,108 INFO MainThread eval_mmteb_v2.py:306] [3/8] Evaluating: PoemSentimentClassification +[2026-05-18 20:12:35,108 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 20:12:37,365 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-18 20:12:37,367 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 20:12:37,443 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:12:57,997 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:12:58,008 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 20:12:58,065 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 20:12:58,126 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:13:02,852 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:13:02,860 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:13:02,915 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 20:13:02,972 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:13:07,288 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:13:07,296 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:13:07,353 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 20:13:07,410 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:13:12,002 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:13:12,015 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:13:12,085 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 20:13:12,160 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:13:16,741 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:13:16,750 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:13:16,826 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 20:13:16,902 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:13:21,572 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:13:21,581 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 20:13:21,638 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 20:13:21,696 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:13:26,380 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:13:26,388 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:13:26,444 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 20:13:26,503 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:13:30,997 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:13:31,006 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:13:31,079 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 20:13:31,153 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:13:35,629 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:13:35,638 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:13:35,709 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 20:13:35,786 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:13:40,675 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:13:40,684 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:13:40,743 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 20:13:40,743 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... 
+[2026-05-18 20:13:40,745 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 20:13:40,807 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:01,242 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:01,253 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:14:01,322 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 20:14:01,397 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:05,984 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:05,990 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:14:06,057 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 20:14:06,116 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:10,278 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:10,286 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:14:10,343 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 20:14:10,401 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:14,872 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:14,881 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:14:14,938 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 20:14:14,996 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:19,892 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:19,900 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 20:14:19,956 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 20:14:20,014 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:24,243 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:24,254 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:14:24,337 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 20:14:24,416 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:29,211 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:29,218 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:14:29,275 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 20:14:29,333 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:34,372 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:34,382 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:14:34,453 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 20:14:34,527 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:39,013 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:39,022 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:14:39,092 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 20:14:39,166 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:14:44,212 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:14:44,221 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 20:14:44,308 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 20:14:44,310 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-18 20:14:44,311 INFO MainThread eval_mmteb_v2.py:306] [4/8] Evaluating: KorSarcasmClassification +[2026-05-18 20:14:44,312 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 20:14:46,267 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-18 20:14:46,270 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 20:14:46,715 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:09,916 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:09,922 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:22:11,012 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 20:22:11,452 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:14,030 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:14,034 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:22:15,124 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 20:22:15,562 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:17,942 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:17,952 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 20:22:19,086 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 20:22:19,524 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:22,626 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:22,636 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:22:24,050 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 20:22:24,547 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:27,420 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:27,427 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:22:28,838 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 20:22:29,284 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:31,897 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:31,902 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:22:32,992 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 20:22:33,431 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:35,904 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:35,909 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:22:37,019 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 20:22:37,459 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:40,641 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:40,649 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 20:22:41,741 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 20:22:42,186 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:45,613 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:45,618 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:22:46,756 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 20:22:47,206 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 20:22:50,074 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 20:22:50,081 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 20:22:51,186 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-18 20:22:51,187 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-18 20:22:51,188 INFO MainThread eval_mmteb_v2.py:306] [5/8] Evaluating: KorHateSpeechMLClassification +[2026-05-18 20:22:56,527 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-18 20:22:56,531 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-18 20:23:06,586 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-18 20:24:52,847 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-18 20:24:53,408 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-18 20:31:48,421 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... 
+[2026-05-18 20:31:54,140 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. +[2026-05-18 20:31:54,142 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-18 20:31:54,143 INFO MainThread eval_mmteb_v2.py:306] [6/8] Evaluating: SICK-R +[2026-05-18 20:31:56,696 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-18 20:31:56,799 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 20:45:30,832 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 20:58:45,528 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 20:58:45,722 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 20:58:45,798 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-18 20:58:45,800 INFO MainThread eval_mmteb_v2.py:306] [7/8] Evaluating: STSBenchmark +[2026-05-18 20:58:49,621 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-18 20:58:49,642 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:00:58,250 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:03:06,797 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:03:06,813 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:03:06,828 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-18 21:03:06,829 INFO MainThread eval_mmteb_v2.py:306] [8/8] Evaluating: STS17 +[2026-05-18 21:03:09,125 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... 
+[2026-05-18 21:03:09,162 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:10:00,312 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:17:01,668 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:17:01,713 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:17:01,741 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ar-ar')... +[2026-05-18 21:17:01,747 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:17:27,517 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:17:53,498 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:17:53,504 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:17:53,528 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-ar')... +[2026-05-18 21:17:53,533 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:18:13,269 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:18:39,717 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:18:39,721 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:18:39,741 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-de')... 
+[2026-05-18 21:18:39,745 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:18:59,946 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:19:24,488 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:19:24,492 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:19:24,513 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-en')... +[2026-05-18 21:19:24,518 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:19:44,553 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:20:04,898 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:20:04,903 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:20:04,932 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-tr')... +[2026-05-18 21:20:04,937 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:20:24,223 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:20:50,310 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:20:50,314 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:20:50,344 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-en')... 
+[2026-05-18 21:20:50,349 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:21:15,119 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:21:34,414 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:21:34,420 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:21:34,451 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-es')... +[2026-05-18 21:21:34,456 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:21:59,326 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:22:25,682 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:22:25,687 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:22:25,713 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='fr-en')... +[2026-05-18 21:22:25,717 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:22:52,616 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:23:12,734 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:23:12,738 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:23:12,765 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='it-en')... 
+[2026-05-18 21:23:12,769 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:23:40,152 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:23:59,420 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:23:59,424 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:23:59,452 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='nl-en')... +[2026-05-18 21:23:59,456 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 21:24:25,848 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 21:24:46,230 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 21:24:46,235 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 21:24:46,273 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STS17 +[2026-05-18 21:24:46,584 INFO MainThread eval_mmteb_v2.py:494] Done! 
+[2026-05-18 22:18:31,333 INFO MainThread simple_encoder.py:65] Use varlen batching: True +[2026-05-18 22:19:07,964 INFO MainThread simple_encoder.py:65] Use varlen batching: True +[2026-05-18 22:19:08,248 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 22:19:11,066 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 22:19:12,304 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=1 +[2026-05-18 22:19:12,322 INFO MainThread data_utils.py:81] Avg length: 9, Max length: 9, 0.0k tokens, no truncation. +[2026-05-18 22:29:12,280 INFO MainThread eval_mmteb_v2.py:458] Will evaluate 1 tasks: ['BornholmBitextMining'] +[2026-05-18 22:29:12,280 INFO MainThread eval_mmteb_v2.py:470] +============================================================ +[2026-05-18 22:29:12,280 INFO MainThread eval_mmteb_v2.py:471] Evaluating model type: safetensors +[2026-05-18 22:29:12,280 INFO MainThread eval_mmteb_v2.py:472] ============================================================ +[2026-05-18 22:29:12,397 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 22:29:12,467 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 22:29:15,134 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 22:29:16,124 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, 
gpu_count=2 +[2026-05-18 22:29:16,125 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-18 22:29:16,125 INFO MainThread eval_mmteb_v2.py:322] [1/1] Evaluating: BornholmBitextMining +[2026-05-18 22:29:16,127 INFO MainThread evaluate.py:424] Results for BornholmBitextMining already exist in cache. Skipping evaluation and loading results. +[2026-05-18 22:29:16,127 INFO MainThread eval_mmteb_v2.py:470] +============================================================ +[2026-05-18 22:29:16,127 INFO MainThread eval_mmteb_v2.py:471] Evaluating model type: safetensors_noquant +[2026-05-18 22:29:16,127 INFO MainThread eval_mmteb_v2.py:472] ============================================================ +[2026-05-18 22:29:16,128 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 22:29:16,186 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 22:29:18,585 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 22:29:19,266 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-18 22:29:19,266 INFO MainThread eval_mmteb_v2.py:212] BitLinear quantization DISABLED (no-quant mode for F16 comparison) +[2026-05-18 22:29:19,266 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors-noquant +[2026-05-18 22:29:19,266 INFO MainThread eval_mmteb_v2.py:322] [1/1] Evaluating: BornholmBitextMining +[2026-05-18 22:29:22,457 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... 
+[2026-05-18 22:29:25,841 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 22:29:25,851 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-18 22:29:25,860 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 22:29:25,892 INFO MainThread eval_mmteb_v2.py:470] +============================================================ +[2026-05-18 22:29:25,892 INFO MainThread eval_mmteb_v2.py:471] Evaluating model type: f16 +[2026-05-18 22:29:25,892 INFO MainThread eval_mmteb_v2.py:472] ============================================================ +[2026-05-18 22:29:25,892 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-f16-gguf +[2026-05-18 22:29:25,892 INFO MainThread eval_mmteb_v2.py:322] [1/1] Evaluating: BornholmBitextMining +[2026-05-18 22:29:25,892 INFO MainThread evaluate.py:424] Results for BornholmBitextMining already exist in cache. Skipping evaluation and loading results. +[2026-05-18 22:29:25,892 INFO MainThread eval_mmteb_v2.py:470] +============================================================ +[2026-05-18 22:29:25,892 INFO MainThread eval_mmteb_v2.py:471] Evaluating model type: i2s +[2026-05-18 22:29:25,892 INFO MainThread eval_mmteb_v2.py:472] ============================================================ +[2026-05-18 22:29:25,893 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-i2s-gguf +[2026-05-18 22:29:25,893 INFO MainThread eval_mmteb_v2.py:322] [1/1] Evaluating: BornholmBitextMining +[2026-05-18 22:29:25,893 INFO MainThread evaluate.py:424] Results for BornholmBitextMining already exist in cache. Skipping evaluation and loading results. +[2026-05-18 22:29:25,893 INFO MainThread eval_mmteb_v2.py:529] Done! 
+[2026-05-18 22:30:59,662 INFO MainThread eval_mmteb_v2.py:458] Will evaluate 8 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-18 22:30:59,662 INFO MainThread eval_mmteb_v2.py:470] +============================================================ +[2026-05-18 22:30:59,662 INFO MainThread eval_mmteb_v2.py:471] Evaluating model type: safetensors +[2026-05-18 22:30:59,662 INFO MainThread eval_mmteb_v2.py:472] ============================================================ +[2026-05-18 22:30:59,774 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 22:30:59,844 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 22:31:02,644 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 22:31:03,590 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-18 22:31:03,591 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-18 22:31:03,591 INFO MainThread eval_mmteb_v2.py:322] [1/8] Evaluating: BornholmBitextMining +[2026-05-18 22:31:07,846 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-18 22:31:12,487 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 22:31:12,499 INFO MainThread bitext_mining.py:188] Computing metrics... 
+[2026-05-18 22:31:12,508 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 22:31:12,538 INFO MainThread eval_mmteb_v2.py:322] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-18 22:31:12,539 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 22:31:15,610 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-18 22:31:15,611 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 22:31:16,244 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:21,806 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:21,817 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:22,986 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 22:31:23,608 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:23,815 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:23,827 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:24,969 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 22:31:25,462 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:25,656 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:25,662 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:26,560 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 22:31:27,043 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 22:31:27,246 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:27,256 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:28,141 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 22:31:28,628 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:28,837 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:28,848 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:29,991 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 22:31:30,476 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:30,676 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:30,686 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:31,833 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 22:31:32,321 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:32,516 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:32,528 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:33,458 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 22:31:33,937 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:34,140 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:34,148 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:35,306 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 22:31:35,932 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 22:31:36,141 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:36,151 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:37,281 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 22:31:37,765 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:37,965 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:37,974 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:39,077 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-18 22:31:39,079 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-18 22:31:39,112 INFO MainThread eval_mmteb_v2.py:322] [3/8] Evaluating: PoemSentimentClassification +[2026-05-18 22:31:39,114 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 22:31:42,116 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-18 22:31:42,118 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 22:31:42,193 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:42,665 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:42,678 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:42,749 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 22:31:42,828 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:43,027 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 22:31:43,038 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:43,143 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 22:31:43,217 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:43,412 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:43,427 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:43,532 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 22:31:43,606 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:43,797 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:43,807 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:43,912 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 22:31:43,986 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:44,191 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:44,204 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:44,275 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 22:31:44,348 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:44,544 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:44,558 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:44,663 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 22:31:44,736 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:44,928 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-18 22:31:44,940 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:45,012 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 22:31:45,091 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:45,300 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:45,312 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:45,395 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 22:31:45,471 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:45,658 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:45,669 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:45,740 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 22:31:45,814 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:46,007 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:46,017 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:46,088 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-18 22:31:46,088 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... +[2026-05-18 22:31:46,090 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 22:31:46,163 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:46,565 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:46,578 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 22:31:46,648 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 22:31:46,722 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:46,924 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:46,938 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:47,008 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 22:31:47,081 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:47,288 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:47,302 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:47,372 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 22:31:47,445 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:47,651 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:47,664 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:47,734 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 22:31:47,808 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:48,228 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:48,244 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:48,350 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 22:31:48,424 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:48,608 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:48,616 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-18 22:31:48,701 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 22:31:48,764 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:48,961 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:48,972 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:49,042 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 22:31:49,116 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:49,335 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:49,350 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:49,420 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 22:31:49,499 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:49,695 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:49,708 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:49,807 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 22:31:49,889 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:50,090 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:50,105 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:31:50,176 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. 
+[2026-05-18 22:31:50,177 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-18 22:31:50,201 INFO MainThread eval_mmteb_v2.py:322] [4/8] Evaluating: KorSarcasmClassification +[2026-05-18 22:31:50,202 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-18 22:31:53,109 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-18 22:31:53,111 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-18 22:31:53,692 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:31:59,292 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:31:59,297 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:00,758 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-18 22:32:01,226 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:32:01,419 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:32:01,423 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:02,605 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-18 22:32:03,046 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:32:03,246 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:32:03,255 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:04,715 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-18 22:32:05,160 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 22:32:05,364 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:32:05,372 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:06,819 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-18 22:32:07,391 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:32:07,591 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:32:07,600 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:09,114 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-18 22:32:09,697 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:32:09,896 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:32:09,905 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:11,193 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-18 22:32:11,638 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:32:11,890 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:32:11,900 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:13,386 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-18 22:32:13,963 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:32:14,169 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:32:14,176 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:15,421 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-18 22:32:15,879 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-18 22:32:16,144 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:32:16,185 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:17,814 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-18 22:32:18,390 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-18 22:32:18,606 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-18 22:32:18,641 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-18 22:32:20,135 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-18 22:32:20,137 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-18 22:32:25,482 INFO MainThread eval_mmteb_v2.py:322] [5/8] Evaluating: KorHateSpeechMLClassification +[2026-05-18 22:32:28,407 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-18 22:32:28,412 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-18 22:32:38,818 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-18 22:33:00,139 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-18 22:33:00,705 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-18 22:33:18,630 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... 
+[2026-05-18 22:46:40,452 INFO MainThread eval_mmteb_v2.py:458] Will evaluate 1 tasks: ['STSBenchmark'] +[2026-05-18 22:46:40,453 INFO MainThread eval_mmteb_v2.py:470] +============================================================ +[2026-05-18 22:46:40,453 INFO MainThread eval_mmteb_v2.py:471] Evaluating model type: safetensors +[2026-05-18 22:46:40,453 INFO MainThread eval_mmteb_v2.py:472] ============================================================ +[2026-05-18 22:46:40,543 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 22:46:40,613 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 22:46:43,718 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 22:46:44,809 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-18 22:46:44,810 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-18 22:46:44,810 INFO MainThread eval_mmteb_v2.py:322] [1/1] Evaluating: STSBenchmark +[2026-05-18 22:46:49,109 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-18 22:46:49,130 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 22:46:52,033 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 22:46:53,831 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 22:46:53,851 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. 
+[2026-05-18 22:46:53,868 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-18 22:46:53,896 INFO MainThread eval_mmteb_v2.py:529] Done! +[2026-05-18 22:47:29,464 INFO MainThread eval_mmteb_v2.py:458] Will evaluate 1 tasks: ['STSBenchmark'] +[2026-05-18 22:47:29,464 INFO MainThread eval_mmteb_v2.py:470] +============================================================ +[2026-05-18 22:47:29,464 INFO MainThread eval_mmteb_v2.py:471] Evaluating model type: i2s +[2026-05-18 22:47:29,464 INFO MainThread eval_mmteb_v2.py:472] ============================================================ +[2026-05-18 22:47:29,464 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-i2s-gguf +[2026-05-18 22:47:29,464 INFO MainThread eval_mmteb_v2.py:322] [1/1] Evaluating: STSBenchmark +[2026-05-18 22:47:33,827 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-18 22:47:33,848 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 22:49:47,983 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 22:52:04,534 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 22:52:05,862 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 22:52:05,886 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-18 22:52:05,900 INFO MainThread eval_mmteb_v2.py:529] Done! 
+[2026-05-18 22:57:38,756 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 22:57:39,023 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 22:57:42,119 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 22:57:43,280 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=8 +[2026-05-18 23:00:48,316 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 23:00:48,581 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 23:00:51,701 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 23:00:52,966 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=8 +[2026-05-18 23:01:22,663 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 23:01:22,927 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 23:01:26,452 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 23:01:27,685 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, 
gpu_count=8 +[2026-05-18 23:01:56,349 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 23:01:56,558 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 23:01:59,223 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 23:02:00,317 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=mean, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=8 +[2026-05-18 23:02:21,909 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 23:02:22,174 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 23:02:25,001 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 23:02:26,161 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=mean, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=8 +[2026-05-18 23:05:07,983 INFO MainThread eval_mmteb_v2.py:458] Will evaluate 1 tasks: ['STSBenchmark'] +[2026-05-18 23:05:07,983 INFO MainThread eval_mmteb_v2.py:470] +============================================================ +[2026-05-18 23:05:07,983 INFO MainThread eval_mmteb_v2.py:471] Evaluating model type: i2s +[2026-05-18 23:05:07,983 INFO MainThread eval_mmteb_v2.py:472] ============================================================ +[2026-05-18 23:05:07,984 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-i2s-gguf +[2026-05-18 23:05:07,984 INFO MainThread eval_mmteb_v2.py:322] [1/1] Evaluating: 
STSBenchmark +[2026-05-18 23:05:12,154 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-18 23:05:12,172 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 23:07:20,523 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 23:09:28,297 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 23:09:28,679 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 23:09:28,695 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-18 23:09:28,697 INFO MainThread eval_mmteb_v2.py:529] Done! +[2026-05-18 23:10:01,158 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 23:10:01,424 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 23:10:04,421 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 23:10:05,692 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=avg, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-18 23:10:38,158 INFO MainThread eval_mmteb_v2.py:458] Will evaluate 1 tasks: ['STSBenchmark'] +[2026-05-18 23:10:38,158 INFO MainThread eval_mmteb_v2.py:470] +============================================================ +[2026-05-18 23:10:38,158 INFO MainThread eval_mmteb_v2.py:471] Evaluating model type: safetensors +[2026-05-18 23:10:38,158 INFO MainThread eval_mmteb_v2.py:472] ============================================================ +[2026-05-18 23:10:38,271 INFO MainThread simple_encoder.py:65] Use varlen batching: False 
+[2026-05-18 23:10:38,343 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 23:10:41,175 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 23:10:42,166 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=avg, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-18 23:10:42,166 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors-avg +[2026-05-18 23:10:42,166 INFO MainThread eval_mmteb_v2.py:322] [1/1] Evaluating: STSBenchmark +[2026-05-18 23:10:46,244 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-18 23:10:46,264 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-18 23:10:49,049 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-18 23:10:50,854 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-18 23:10:50,877 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-18 23:10:50,894 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-18 23:10:50,923 INFO MainThread eval_mmteb_v2.py:529] Done! 
+[2026-05-18 23:23:25,891 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 23:23:26,180 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-18 23:23:29,347 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-18 23:23:30,501 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-18 23:32:49,238 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 23:33:05,091 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-18 23:37:19,183 INFO MainThread eval_mmteb_v2.py:443] Will evaluate 1 tasks: ['BornholmBitextMining'] +[2026-05-18 23:37:19,184 INFO MainThread eval_mmteb_v2.py:455] +============================================================ +[2026-05-18 23:37:19,184 INFO MainThread eval_mmteb_v2.py:456] Evaluating model type: i2s +[2026-05-18 23:37:19,184 INFO MainThread eval_mmteb_v2.py:457] ============================================================ +[2026-05-18 23:37:19,184 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-i2s-gguf +[2026-05-18 23:37:19,184 INFO MainThread eval_mmteb_v2.py:322] [1/1] Evaluating: BornholmBitextMining +[2026-05-18 23:37:22,386 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-18 23:39:21,643 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-18 23:39:21,820 INFO MainThread bitext_mining.py:188] Computing metrics... 
+[2026-05-18 23:39:21,831 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-18 23:39:21,833 INFO MainThread eval_mmteb_v2.py:504] Done! +[2026-05-19 00:05:31,215 INFO MainThread eval_mmteb_v2.py:443] Will evaluate 8 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-19 00:05:31,216 INFO MainThread eval_mmteb_v2.py:455] +============================================================ +[2026-05-19 00:05:31,216 INFO MainThread eval_mmteb_v2.py:456] Evaluating model type: safetensors +[2026-05-19 00:05:31,216 INFO MainThread eval_mmteb_v2.py:457] ============================================================ +[2026-05-19 00:05:31,329 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-19 00:05:31,401 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-19 00:05:34,651 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-19 00:05:35,817 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-19 00:05:35,818 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-19 00:05:35,818 INFO MainThread eval_mmteb_v2.py:322] [1/8] Evaluating: BornholmBitextMining +[2026-05-19 00:05:40,191 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-19 00:05:44,803 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... 
+[2026-05-19 00:05:44,814 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-19 00:05:44,824 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-19 00:05:44,854 INFO MainThread eval_mmteb_v2.py:322] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-19 00:05:44,856 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:05:47,837 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-19 00:05:47,838 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:05:48,473 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:05:54,085 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:05:54,094 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:05:55,222 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:05:55,845 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:05:56,047 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:05:56,056 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:05:57,194 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:05:57,816 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:05:58,015 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:05:58,026 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:05:59,141 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:05:59,710 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:05:59,904 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:05:59,916 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:00,792 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:06:01,284 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:01,492 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:01,506 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:02,666 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:06:03,295 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:03,534 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:03,544 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:04,576 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:06:05,057 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:05,256 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:05,269 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:06,427 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:06:07,050 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:07,259 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:07,270 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:06:08,159 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:06:08,645 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:08,870 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:08,879 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:09,827 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:06:10,321 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:10,528 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:10,541 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:11,489 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-19 00:06:11,490 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-19 00:06:11,522 INFO MainThread eval_mmteb_v2.py:322] [3/8] Evaluating: PoemSentimentClassification +[2026-05-19 00:06:11,524 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:06:14,459 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-19 00:06:14,461 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:06:14,521 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:14,968 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:14,981 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:06:15,050 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:06:15,124 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:15,316 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:15,330 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:15,400 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:06:15,473 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:15,673 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:15,686 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:15,755 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:06:15,829 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:16,029 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:16,041 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:16,111 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:06:16,184 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:16,386 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:16,400 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:16,468 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:06:16,541 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:16,734 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:16,745 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:06:16,815 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:06:16,889 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:17,087 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:17,102 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:17,172 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:06:17,246 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:17,446 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:17,456 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:17,525 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:06:17,598 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:17,803 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:17,812 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:17,882 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:06:17,956 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:18,155 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:18,169 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:18,241 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 00:06:18,241 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... 
+[2026-05-19 00:06:18,244 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:06:18,318 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:18,727 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:18,741 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:18,811 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:06:18,884 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:19,082 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:19,096 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:19,166 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:06:19,239 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:19,451 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:19,467 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:19,536 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:06:19,615 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:19,808 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:19,821 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:19,890 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:06:19,962 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:20,162 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:20,175 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:06:20,244 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:06:20,317 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:20,526 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:20,536 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:20,604 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:06:20,678 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:21,121 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:21,132 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:21,203 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:06:21,277 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:21,473 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:21,486 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:21,555 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:06:21,629 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:21,825 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:21,837 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:06:21,907 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:06:21,981 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:06:22,177 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:06:22,195 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:06:22,276 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 00:06:22,277 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-19 00:06:22,304 INFO MainThread eval_mmteb_v2.py:322] [4/8] Evaluating: KorSarcasmClassification +[2026-05-19 00:06:22,306 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:06:25,714 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-19 00:06:25,716 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:06:26,293 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:07:36,845 INFO MainThread eval_mmteb_v2.py:443] Will evaluate 8 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-19 00:07:36,846 INFO MainThread eval_mmteb_v2.py:455] +============================================================ +[2026-05-19 00:07:36,846 INFO MainThread eval_mmteb_v2.py:456] Evaluating model type: safetensors +[2026-05-19 00:07:36,846 INFO MainThread eval_mmteb_v2.py:457] ============================================================ +[2026-05-19 00:07:36,958 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-19 00:07:37,028 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-19 00:07:39,878 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-19 00:07:40,789 INFO MainThread simple_encoder.py:95] Loaded model 
/data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-19 00:07:40,789 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-19 00:07:40,790 INFO MainThread eval_mmteb_v2.py:322] [1/8] Evaluating: BornholmBitextMining +[2026-05-19 00:07:44,030 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-19 00:07:48,678 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-19 00:07:48,691 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-19 00:07:48,703 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-19 00:07:48,734 INFO MainThread eval_mmteb_v2.py:322] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-19 00:07:48,736 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:07:51,588 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-19 00:07:51,589 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:07:52,229 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:07:57,781 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:07:57,790 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:07:58,958 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:07:59,445 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:07:59,637 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-19 00:07:59,644 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:00,509 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:08:00,987 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:01,181 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:01,188 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:02,064 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:08:02,545 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:02,774 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:02,788 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:03,948 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:08:04,461 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:04,652 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:04,662 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:05,756 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:08:06,234 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:06,448 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:06,458 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:07,322 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:08:07,802 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:08,020 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-19 00:08:08,033 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:09,083 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:08:09,564 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:09,815 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:09,826 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:10,741 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:08:11,220 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:11,415 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:11,421 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:12,340 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:08:12,819 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:13,029 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:13,038 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:13,958 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-19 00:08:13,959 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-19 00:08:13,991 INFO MainThread eval_mmteb_v2.py:322] [3/8] Evaluating: PoemSentimentClassification +[2026-05-19 00:08:13,993 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:08:17,003 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... 
+[2026-05-19 00:08:17,004 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:08:17,064 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:17,501 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:17,514 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:17,569 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:08:17,626 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:17,858 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:17,872 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:17,942 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:08:18,016 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:18,216 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:18,230 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:18,299 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:08:18,372 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:18,575 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:18,590 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:18,660 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:08:18,735 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:18,941 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:18,953 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:08:19,022 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:08:19,095 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:19,297 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:19,309 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:19,378 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:08:19,452 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:19,648 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:19,662 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:19,716 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:08:19,772 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:19,975 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:19,988 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:20,045 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:08:20,102 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:20,306 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:20,320 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:20,375 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:08:20,432 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:20,620 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:20,632 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:08:20,726 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 00:08:20,727 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... +[2026-05-19 00:08:20,728 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:08:20,784 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:21,165 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:21,177 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:21,233 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:08:21,289 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:21,503 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:21,518 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:21,586 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:08:21,659 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:21,850 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:21,862 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:21,916 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:08:21,972 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:22,152 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:22,162 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:08:22,218 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:08:22,274 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:22,473 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:22,481 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:22,535 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:08:22,591 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:22,991 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:23,005 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:23,076 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:08:23,150 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:23,355 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:23,368 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:23,422 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:08:23,478 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:23,675 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:23,690 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:23,761 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:08:23,834 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:24,033 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:24,044 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:08:24,099 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:08:24,156 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:24,358 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:24,371 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:24,444 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 00:08:24,445 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-19 00:08:24,470 INFO MainThread eval_mmteb_v2.py:322] [4/8] Evaluating: KorSarcasmClassification +[2026-05-19 00:08:24,472 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:08:27,328 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-19 00:08:27,330 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:08:27,776 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:33,413 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:33,418 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:34,733 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:08:35,180 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:35,381 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:35,389 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:08:36,720 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:08:37,157 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:37,357 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:37,361 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:38,791 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:08:39,364 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:39,568 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:39,573 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:40,990 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:08:41,426 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:41,618 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:41,622 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:42,759 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:08:43,197 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:43,405 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:43,408 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:44,545 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:08:44,984 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:45,184 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:45,192 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:08:46,369 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:08:46,812 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:47,000 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:47,008 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:48,098 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:08:48,536 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:48,734 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:48,742 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:49,830 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:08:50,269 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:08:50,472 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:08:50,488 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:08:51,587 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-19 00:08:51,588 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-19 00:08:51,619 INFO MainThread eval_mmteb_v2.py:322] [5/8] Evaluating: KorHateSpeechMLClassification +[2026-05-19 00:08:54,665 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-19 00:08:54,670 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-19 00:09:03,910 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... 
+[2026-05-19 00:09:06,303 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-19 00:09:06,867 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-19 00:09:15,638 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... +[2026-05-19 00:09:23,448 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. +[2026-05-19 00:09:23,451 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-19 00:09:23,488 INFO MainThread eval_mmteb_v2.py:322] [6/8] Evaluating: SICK-R +[2026-05-19 00:09:26,522 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-19 00:09:26,620 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:09:36,603 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:09:46,950 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:09:47,155 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:09:47,228 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-19 00:09:47,262 INFO MainThread eval_mmteb_v2.py:322] [7/8] Evaluating: STSBenchmark +[2026-05-19 00:09:51,247 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-19 00:09:51,266 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:09:53,114 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:09:54,881 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... 
+[2026-05-19 00:09:54,897 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:09:54,913 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-19 00:09:54,940 INFO MainThread eval_mmteb_v2.py:322] [8/8] Evaluating: STS17 +[2026-05-19 00:09:57,182 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... +[2026-05-19 00:09:57,220 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:06,116 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:14,730 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:14,770 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:14,795 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ar-ar')... +[2026-05-19 00:10:14,800 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:15,162 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:15,505 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:15,509 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:15,527 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-ar')... +[2026-05-19 00:10:15,531 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:15,795 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:16,154 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... 
+[2026-05-19 00:10:16,160 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:16,184 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-de')... +[2026-05-19 00:10:16,189 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:16,462 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:16,764 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:16,771 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:16,798 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-en')... +[2026-05-19 00:10:16,803 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:17,079 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:17,358 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:17,363 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:17,389 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-tr')... +[2026-05-19 00:10:17,394 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:17,665 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:17,999 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:18,003 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:18,030 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-en')... 
+[2026-05-19 00:10:18,035 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:18,414 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:18,694 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:18,700 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:18,729 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-es')... +[2026-05-19 00:10:18,734 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:19,090 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:19,427 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:19,432 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:19,474 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='fr-en')... +[2026-05-19 00:10:19,479 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:19,800 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:20,073 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:20,077 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:20,108 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='it-en')... 
+[2026-05-19 00:10:20,113 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:20,444 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:20,717 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:20,724 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:20,756 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='nl-en')... +[2026-05-19 00:10:20,761 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:10:21,133 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:10:21,411 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:10:21,416 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:10:21,455 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STS17 +[2026-05-19 00:10:21,505 INFO MainThread eval_mmteb_v2.py:455] +============================================================ +[2026-05-19 00:10:21,505 INFO MainThread eval_mmteb_v2.py:456] Evaluating model type: f16 +[2026-05-19 00:10:21,506 INFO MainThread eval_mmteb_v2.py:457] ============================================================ +[2026-05-19 00:10:21,506 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-f16-gguf +[2026-05-19 00:10:21,506 INFO MainThread eval_mmteb_v2.py:322] [1/8] Evaluating: BornholmBitextMining +[2026-05-19 00:10:21,506 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... 
+[2026-05-19 00:45:04,310 INFO MainThread eval_mmteb_v2.py:443] Will evaluate 8 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-19 00:45:04,310 INFO MainThread eval_mmteb_v2.py:455] +============================================================ +[2026-05-19 00:45:04,310 INFO MainThread eval_mmteb_v2.py:456] Evaluating model type: safetensors +[2026-05-19 00:45:04,310 INFO MainThread eval_mmteb_v2.py:457] ============================================================ +[2026-05-19 00:45:04,420 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-19 00:45:04,509 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-19 00:45:07,042 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-19 00:45:08,093 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-19 00:45:08,093 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-19 00:45:08,093 INFO MainThread eval_mmteb_v2.py:322] [1/8] Evaluating: BornholmBitextMining +[2026-05-19 00:45:11,261 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-19 00:45:15,870 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-19 00:45:15,887 INFO MainThread bitext_mining.py:188] Computing metrics... 
+[2026-05-19 00:45:15,899 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-19 00:45:15,930 INFO MainThread eval_mmteb_v2.py:322] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-19 00:45:15,933 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:45:18,885 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-19 00:45:18,886 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:45:19,525 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:25,103 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:25,114 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:26,293 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:45:26,920 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:27,143 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:27,156 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:28,294 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:45:28,912 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:29,127 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:29,137 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:30,299 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:45:30,940 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-19 00:45:31,168 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:31,179 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:32,353 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:45:32,947 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:33,155 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:33,163 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:34,093 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:45:34,578 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:34,791 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:34,803 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:35,978 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:45:36,592 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:36,796 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:36,813 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:37,968 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:45:38,595 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:38,811 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:38,824 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:39,986 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:45:40,636 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-19 00:45:40,861 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:40,870 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:42,027 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:45:42,655 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:42,853 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:42,864 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:44,048 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-19 00:45:44,050 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-19 00:45:44,086 INFO MainThread eval_mmteb_v2.py:322] [3/8] Evaluating: PoemSentimentClassification +[2026-05-19 00:45:44,089 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:45:47,099 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-19 00:45:47,101 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:45:47,177 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:47,640 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:47,650 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:47,719 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:45:47,792 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:47,993 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-19 00:45:48,005 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:48,075 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:45:48,150 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:48,356 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:48,369 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:48,439 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:45:48,513 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:48,718 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:48,731 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:48,800 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:45:48,874 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:49,081 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:49,093 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:49,163 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:45:49,236 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:49,443 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:49,457 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:49,533 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:45:49,606 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:49,803 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... 
+[2026-05-19 00:45:49,819 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:49,924 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:45:49,997 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:50,215 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:50,231 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:50,300 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:45:50,373 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:50,601 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:50,612 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:50,682 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:45:50,755 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:50,969 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:50,979 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:51,051 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 00:45:51,051 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... +[2026-05-19 00:45:51,054 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:45:51,126 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:51,567 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:51,582 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:45:51,651 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:45:51,724 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:51,940 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:51,952 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:52,021 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:45:52,096 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:52,306 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:52,323 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:52,392 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:45:52,467 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:52,686 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:52,698 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:52,771 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:45:52,850 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:53,055 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:53,066 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:53,142 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:45:53,218 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:53,838 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:53,850 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:45:53,921 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:45:53,994 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:54,200 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:54,212 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:54,283 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:45:54,356 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:54,561 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:54,574 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:54,643 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:45:54,718 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:54,927 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:54,941 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:55,011 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:45:55,085 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:45:55,289 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:45:55,302 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:45:55,375 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. 
+[2026-05-19 00:45:55,377 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-19 00:45:55,404 INFO MainThread eval_mmteb_v2.py:322] [4/8] Evaluating: KorSarcasmClassification +[2026-05-19 00:45:55,406 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:45:58,318 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-19 00:45:58,320 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:45:58,899 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:46:04,612 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:04,622 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:06,043 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:46:06,614 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:46:06,827 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:06,834 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:08,211 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:46:08,659 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:46:08,856 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:08,863 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:09,979 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:46:10,424 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-19 00:46:10,628 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:10,632 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:11,737 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:46:12,182 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:46:12,400 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:12,407 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:13,879 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:46:14,428 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:46:14,619 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:14,628 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:15,736 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:46:16,180 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:46:16,406 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:16,413 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:17,868 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:46:18,316 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:46:18,501 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:18,512 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:19,621 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:46:20,062 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... 
+[2026-05-19 00:46:20,278 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:20,289 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:21,746 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:46:22,315 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:46:22,523 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:46:22,529 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:46:23,846 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-19 00:46:23,848 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-19 00:46:23,880 INFO MainThread eval_mmteb_v2.py:322] [5/8] Evaluating: KorHateSpeechMLClassification +[2026-05-19 00:46:26,818 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-19 00:46:26,822 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-19 00:46:36,496 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-19 00:46:38,890 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-19 00:46:39,347 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-19 00:46:48,146 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... +[2026-05-19 00:46:55,528 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. 
+[2026-05-19 00:46:55,530 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-19 00:46:55,571 INFO MainThread eval_mmteb_v2.py:322] [6/8] Evaluating: SICK-R +[2026-05-19 00:46:58,547 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-19 00:46:58,675 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:47:09,255 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:47:19,617 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:47:19,816 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:47:19,899 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-19 00:47:19,929 INFO MainThread eval_mmteb_v2.py:322] [7/8] Evaluating: STSBenchmark +[2026-05-19 00:47:23,858 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-19 00:47:23,878 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:47:25,749 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:47:27,524 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:47:27,539 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:47:27,556 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-19 00:47:27,583 INFO MainThread eval_mmteb_v2.py:322] [8/8] Evaluating: STS17 +[2026-05-19 00:47:30,706 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... 
+[2026-05-19 00:47:30,744 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:47:39,633 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:47:59,348 INFO MainThread eval_mmteb_v2.py:443] Will evaluate 8 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-19 00:47:59,349 INFO MainThread eval_mmteb_v2.py:455] +============================================================ +[2026-05-19 00:47:59,349 INFO MainThread eval_mmteb_v2.py:456] Evaluating model type: safetensors +[2026-05-19 00:47:59,349 INFO MainThread eval_mmteb_v2.py:457] ============================================================ +[2026-05-19 00:47:59,454 INFO MainThread simple_encoder.py:65] Use varlen batching: False +[2026-05-19 00:47:59,527 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-19 00:48:02,611 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +[2026-05-19 00:48:03,700 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-19 00:48:03,700 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-19 00:48:03,700 INFO MainThread eval_mmteb_v2.py:322] [1/8] Evaluating: BornholmBitextMining +[2026-05-19 00:48:06,929 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-19 00:48:11,563 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... 
+[2026-05-19 00:48:11,574 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-19 00:48:11,584 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-19 00:48:11,614 INFO MainThread eval_mmteb_v2.py:322] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-19 00:48:11,615 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:48:15,074 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-19 00:48:15,075 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:48:15,711 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:21,361 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:21,371 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:22,503 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:48:23,131 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:23,336 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:23,348 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:24,480 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:48:25,043 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:25,245 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:25,255 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:48:26,274 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:48:26,757 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:26,946 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:26,954 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:27,838 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:48:28,319 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:28,523 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:28,533 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:29,661 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:48:30,208 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:30,405 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:30,416 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:31,507 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:48:31,997 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:32,193 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:32,208 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:33,343 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:48:33,829 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:34,018 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:34,030 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:48:35,204 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:48:35,699 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:35,908 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:35,918 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:36,829 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:48:37,319 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:37,531 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:37,541 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:38,693 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-19 00:48:38,695 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-19 00:48:38,731 INFO MainThread eval_mmteb_v2.py:322] [3/8] Evaluating: PoemSentimentClassification +[2026-05-19 00:48:38,733 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:48:41,684 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-19 00:48:41,686 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:48:41,762 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:42,202 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:42,215 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:48:42,285 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:48:42,359 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:42,547 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:42,561 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:42,630 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:48:42,705 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:42,908 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:42,923 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:43,028 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:48:43,103 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:43,301 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:43,311 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:43,417 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:48:43,491 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:43,680 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:43,689 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:43,795 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:48:43,868 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:44,061 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:44,071 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:48:44,177 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:48:44,251 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:44,449 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:44,459 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:44,529 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:48:44,602 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:44,792 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:44,806 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:44,875 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:48:44,950 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:45,143 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:45,155 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:45,225 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:48:45,299 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:45,492 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:45,505 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:45,576 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 00:48:45,577 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... 
+[2026-05-19 00:48:45,579 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:48:45,652 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:46,066 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:46,077 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:46,182 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:48:46,255 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:46,462 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:46,475 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:46,558 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:48:46,633 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:46,824 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:46,838 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:46,907 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:48:46,981 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:47,183 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:47,197 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:47,267 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:48:47,342 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:47,781 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:47,793 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:48:47,899 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:48:47,976 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:48,170 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:48,180 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:48,285 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:48:48,359 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:48,553 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:48,563 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:48,633 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:48:48,713 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:48,918 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:48,933 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:49,003 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:48:49,077 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:49,264 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:49,277 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:48:49,346 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:48:49,420 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:49,611 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:49,620 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:48:49,727 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 00:48:49,728 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-19 00:48:49,753 INFO MainThread eval_mmteb_v2.py:322] [4/8] Evaluating: KorSarcasmClassification +[2026-05-19 00:48:49,755 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:48:52,662 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-19 00:48:52,663 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:48:53,238 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:48:58,885 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:48:58,890 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:49:00,312 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 00:49:00,886 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:49:01,079 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:49:01,083 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:49:02,385 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 00:49:02,832 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:49:03,014 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:49:03,023 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:49:04,370 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 00:49:04,815 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:49:05,000 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:49:05,010 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:49:06,107 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 00:49:06,555 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:49:06,754 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:49:06,765 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:49:08,184 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 00:49:08,631 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:49:08,810 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:49:08,818 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:49:09,961 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 00:49:10,408 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:49:10,611 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:49:10,615 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:49:11,759 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 00:49:12,203 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:49:12,405 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:49:12,412 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 00:49:13,508 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 00:49:13,952 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:49:14,148 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:49:14,152 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:49:15,261 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 00:49:15,706 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 00:49:15,892 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 00:49:15,899 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 00:49:17,038 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-19 00:49:17,039 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-19 00:49:17,070 INFO MainThread eval_mmteb_v2.py:322] [5/8] Evaluating: KorHateSpeechMLClassification +[2026-05-19 00:49:20,028 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-19 00:49:20,032 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-19 00:49:29,612 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-19 00:49:32,012 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-19 00:49:32,574 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-19 00:49:41,368 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... 
+[2026-05-19 00:49:49,102 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. +[2026-05-19 00:49:49,104 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-19 00:49:49,139 INFO MainThread eval_mmteb_v2.py:322] [6/8] Evaluating: SICK-R +[2026-05-19 00:49:52,132 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-19 00:49:52,260 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:02,813 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:13,170 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:13,380 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:13,463 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-19 00:50:13,494 INFO MainThread eval_mmteb_v2.py:322] [7/8] Evaluating: STSBenchmark +[2026-05-19 00:50:17,406 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-19 00:50:17,426 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:19,276 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:21,044 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:21,059 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:21,077 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-19 00:50:21,104 INFO MainThread eval_mmteb_v2.py:322] [8/8] Evaluating: STS17 +[2026-05-19 00:50:23,154 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... 
+[2026-05-19 00:50:23,191 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:32,316 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:40,532 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:40,568 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:40,603 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ar-ar')... +[2026-05-19 00:50:40,607 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:40,956 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:41,296 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:41,304 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:41,340 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-ar')... +[2026-05-19 00:50:41,345 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:41,624 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:42,002 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:42,008 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:42,044 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-de')... 
+[2026-05-19 00:50:42,049 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:42,325 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:42,618 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:42,622 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:42,658 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-en')... +[2026-05-19 00:50:42,663 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:42,936 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:43,205 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:43,212 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:43,248 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-tr')... +[2026-05-19 00:50:43,253 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:43,526 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:43,860 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:43,865 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:43,901 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-en')... 
+[2026-05-19 00:50:43,906 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:44,294 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:44,573 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:44,580 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:44,616 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-es')... +[2026-05-19 00:50:44,621 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:44,973 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:45,307 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:45,313 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:45,359 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='fr-en')... +[2026-05-19 00:50:45,364 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:45,687 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:45,967 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:45,974 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:46,009 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='it-en')... 
+[2026-05-19 00:50:46,014 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:46,342 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:46,634 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:46,642 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:46,679 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='nl-en')... +[2026-05-19 00:50:46,684 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 00:50:47,055 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 00:50:47,330 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 00:50:47,337 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 00:50:47,375 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STS17 +[2026-05-19 00:50:47,418 INFO MainThread eval_mmteb_v2.py:455] +============================================================ +[2026-05-19 00:50:47,418 INFO MainThread eval_mmteb_v2.py:456] Evaluating model type: f16 +[2026-05-19 00:50:47,418 INFO MainThread eval_mmteb_v2.py:457] ============================================================ +[2026-05-19 00:50:47,419 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-f16-gguf +[2026-05-19 00:50:47,419 INFO MainThread eval_mmteb_v2.py:322] [1/8] Evaluating: BornholmBitextMining +[2026-05-19 00:50:47,419 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... +[2026-05-19 00:53:37,642 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... 
+[2026-05-19 00:53:37,648 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-19 00:53:37,659 INFO MainThread abstask.py:608] Unloaded dataset BornholmBitextMining from memory. +[2026-05-19 00:53:37,659 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-19 00:53:37,660 INFO MainThread eval_mmteb_v2.py:322] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-19 00:53:37,660 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 00:53:37,660 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-19 00:53:37,661 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 00:53:38,289 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:05:22,407 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:05:22,415 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:05:23,541 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 01:05:24,132 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:05:31,659 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:05:31,667 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:05:32,807 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 01:05:33,437 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:05:42,331 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:05:42,337 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 01:05:43,510 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 01:05:44,044 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:05:51,906 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:05:51,914 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:05:52,822 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 01:05:53,301 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:06:01,754 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:06:01,762 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:06:02,925 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 01:06:03,458 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:06:10,450 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:06:10,459 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:06:11,629 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 01:06:12,253 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:06:18,781 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:06:18,791 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:06:19,916 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 01:06:20,399 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:06:27,999 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:06:28,007 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 01:06:29,183 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 01:06:29,689 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:06:37,451 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:06:37,461 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:06:38,640 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 01:06:39,150 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:06:46,457 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:06:46,463 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:06:47,391 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-19 01:06:47,392 INFO MainThread abstask.py:608] Unloaded dataset FinancialPhrasebankClassification from memory. +[2026-05-19 01:06:47,392 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-19 01:06:47,393 INFO MainThread eval_mmteb_v2.py:322] [3/8] Evaluating: PoemSentimentClassification +[2026-05-19 01:06:47,393 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 01:06:47,394 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-19 01:06:47,395 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 01:06:47,454 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:07:17,878 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:07:17,889 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 01:07:17,959 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 01:07:18,033 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:07:25,776 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:07:25,786 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:07:25,857 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 01:07:25,931 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:07:32,901 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:07:32,911 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:07:32,982 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 01:07:33,058 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:07:39,908 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:07:39,919 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:07:39,990 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 01:07:40,065 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:07:47,432 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:07:47,443 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:07:47,513 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 01:07:47,588 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:07:54,680 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:07:54,690 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 01:07:54,762 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 01:07:54,836 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:08:02,172 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:08:02,183 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:08:02,254 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 01:08:02,328 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:08:09,064 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:08:09,075 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:08:09,146 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 01:08:09,220 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:08:15,665 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:08:15,674 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:08:15,755 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 01:08:15,829 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:08:22,912 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:08:22,923 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:08:23,001 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 01:08:23,002 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... 
+[2026-05-19 01:08:23,004 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 01:08:23,079 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:08:54,126 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:08:54,135 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:08:54,206 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 01:08:54,280 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:09:01,185 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:09:01,195 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:09:01,266 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 01:09:01,341 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:09:08,191 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:09:08,202 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:09:08,273 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 01:09:08,348 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:09:15,294 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:09:15,306 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:09:15,385 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 01:09:15,465 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:09:22,839 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:09:22,848 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 01:09:22,905 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 01:09:22,963 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:09:30,068 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:09:30,078 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:09:30,149 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 01:09:30,223 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:09:37,618 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:09:37,627 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:09:37,683 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 01:09:37,742 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:09:44,724 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:09:44,733 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:09:44,789 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 01:09:44,847 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:09:51,361 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:09:51,372 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:09:51,444 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 01:09:51,518 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:09:58,634 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:09:58,643 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 01:09:58,715 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 01:09:58,716 INFO MainThread abstask.py:608] Unloaded dataset PoemSentimentClassification from memory. +[2026-05-19 01:09:58,717 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-19 01:09:58,718 INFO MainThread eval_mmteb_v2.py:322] [4/8] Evaluating: KorSarcasmClassification +[2026-05-19 01:09:58,718 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 01:09:58,718 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-19 01:09:58,720 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 01:09:59,257 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:20:37,243 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:20:37,248 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:20:38,681 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 01:20:39,211 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:20:43,735 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:20:43,741 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:20:44,838 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 01:20:45,279 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:20:49,213 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:20:49,217 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 01:20:50,644 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 01:20:51,118 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:20:55,860 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:20:55,864 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:20:56,965 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 01:20:57,408 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:21:02,089 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:21:02,095 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:21:03,486 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 01:21:03,928 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:21:08,191 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:21:08,195 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:21:09,340 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 01:21:09,784 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:21:13,872 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:21:13,877 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:21:15,019 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 01:21:15,462 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:21:20,457 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:21:20,461 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 01:21:21,605 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 01:21:22,048 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:21:27,479 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:21:27,486 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:21:28,983 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 01:21:29,579 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 01:21:34,313 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 01:21:34,320 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 01:21:35,800 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-19 01:21:35,802 INFO MainThread abstask.py:608] Unloaded dataset KorSarcasmClassification from memory. +[2026-05-19 01:21:35,802 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-19 01:21:35,803 INFO MainThread eval_mmteb_v2.py:322] [5/8] Evaluating: KorHateSpeechMLClassification +[2026-05-19 01:21:35,803 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-19 01:21:35,808 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-19 01:21:45,502 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-19 01:24:24,233 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-19 01:24:24,798 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... 
+[2026-05-19 01:34:20,413 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... +[2026-05-19 01:34:25,170 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. +[2026-05-19 01:34:25,173 INFO MainThread abstask.py:608] Unloaded dataset KorHateSpeechMLClassification from memory. +[2026-05-19 01:34:25,173 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-19 01:34:25,173 INFO MainThread eval_mmteb_v2.py:322] [6/8] Evaluating: SICK-R +[2026-05-19 01:34:25,174 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-19 01:34:25,270 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 01:55:21,320 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:16:03,436 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:16:03,639 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:16:03,713 INFO MainThread abstask.py:608] Unloaded dataset SICK-R from memory. +[2026-05-19 02:16:03,713 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-19 02:16:03,715 INFO MainThread eval_mmteb_v2.py:322] [7/8] Evaluating: STSBenchmark +[2026-05-19 02:16:03,715 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-19 02:16:03,736 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:19:23,443 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:22:39,751 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... 
+[2026-05-19 02:22:39,769 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:22:39,785 INFO MainThread abstask.py:608] Unloaded dataset STSBenchmark from memory. +[2026-05-19 02:22:39,785 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-19 02:22:39,786 INFO MainThread eval_mmteb_v2.py:322] [8/8] Evaluating: STS17 +[2026-05-19 02:22:39,786 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... +[2026-05-19 02:22:39,824 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:32:47,362 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:42:53,238 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:42:53,271 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:42:53,292 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ar-ar')... +[2026-05-19 02:42:53,296 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:43:30,841 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:44:08,957 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:44:08,962 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:44:08,986 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-ar')... +[2026-05-19 02:44:08,991 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:44:39,336 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... 
+[2026-05-19 02:45:18,792 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:45:18,796 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:45:18,821 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-de')... +[2026-05-19 02:45:18,826 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:45:50,093 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:46:25,776 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:46:25,779 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:46:25,801 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-en')... +[2026-05-19 02:46:25,804 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:46:57,299 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:47:28,581 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:47:28,586 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:47:28,613 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-tr')... +[2026-05-19 02:47:28,618 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:47:58,838 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:48:38,129 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:48:38,133 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. 
+[2026-05-19 02:48:38,162 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-en')... +[2026-05-19 02:48:38,167 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:49:17,169 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:49:48,870 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:49:48,875 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:49:48,899 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-es')... +[2026-05-19 02:49:48,903 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:50:27,281 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:51:05,999 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:51:06,002 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:51:06,028 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='fr-en')... +[2026-05-19 02:51:06,032 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:51:43,999 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:52:16,006 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:52:16,010 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:52:16,044 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='it-en')... 
+[2026-05-19 02:52:16,049 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:52:56,822 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:53:27,613 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:53:27,618 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:53:27,656 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='nl-en')... +[2026-05-19 02:53:27,661 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 02:54:07,299 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 02:54:38,202 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 02:54:38,206 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 02:54:38,246 INFO MainThread abstask.py:608] Unloaded dataset STS17 from memory. +[2026-05-19 02:54:38,246 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STS17 +[2026-05-19 02:54:38,248 INFO MainThread eval_mmteb_v2.py:455] +============================================================ +[2026-05-19 02:54:38,248 INFO MainThread eval_mmteb_v2.py:456] Evaluating model type: i2s +[2026-05-19 02:54:38,248 INFO MainThread eval_mmteb_v2.py:457] ============================================================ +[2026-05-19 02:54:38,248 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-i2s-gguf +[2026-05-19 02:54:38,248 INFO MainThread eval_mmteb_v2.py:322] [1/8] Evaluating: BornholmBitextMining +[2026-05-19 02:54:40,562 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... 
+[2026-05-19 02:56:32,829 INFO MainThread bitext_mining_evaluator.py:62] Finding nearest neighbors... +[2026-05-19 02:56:32,834 INFO MainThread bitext_mining.py:188] Computing metrics... +[2026-05-19 02:56:32,843 INFO MainThread evaluate.py:481] ✓ Finished evaluation for BornholmBitextMining +[2026-05-19 02:56:32,843 INFO MainThread eval_mmteb_v2.py:322] [2/8] Evaluating: FinancialPhrasebankClassification +[2026-05-19 02:56:32,844 WARNING MainThread abstask.py:105] Dataset 'FinancialPhrasebankClassification' is superseded by 'FinancialPhrasebankClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 02:56:34,808 INFO MainThread classification.py:158] Task: FinancialPhrasebankClassification, split: train, subset: default. Running... +[2026-05-19 02:56:34,809 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 02:56:35,439 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:04:32,619 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:04:32,628 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:04:33,791 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 03:04:34,311 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:04:39,235 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:04:39,242 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:04:40,147 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 03:04:40,625 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:04:46,826 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:04:46,832 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 03:04:47,993 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 03:04:48,603 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:04:53,715 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:04:53,724 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:04:54,902 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 03:04:55,531 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:05:01,326 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:05:01,334 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:05:02,529 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 03:05:03,155 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:05:07,713 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:05:07,721 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:05:08,595 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 03:05:09,073 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:05:13,387 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:05:13,397 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:05:14,532 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 03:05:15,160 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:05:19,702 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:05:19,709 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 03:05:20,854 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 03:05:21,489 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:05:27,123 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:05:27,131 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:05:28,322 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 03:05:28,915 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:05:33,835 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:05:33,844 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:05:35,048 INFO MainThread classification.py:247] Running FinancialPhrasebankClassification - Finished. +[2026-05-19 03:05:35,050 INFO MainThread evaluate.py:481] ✓ Finished evaluation for FinancialPhrasebankClassification +[2026-05-19 03:05:35,051 INFO MainThread eval_mmteb_v2.py:322] [3/8] Evaluating: PoemSentimentClassification +[2026-05-19 03:05:35,051 WARNING MainThread abstask.py:105] Dataset 'PoemSentimentClassification' is superseded by 'PoemSentimentClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 03:05:37,298 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: validation, subset: default. Running... +[2026-05-19 03:05:37,300 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 03:05:37,374 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:05:57,877 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:05:57,886 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 03:05:57,942 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 03:05:58,000 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:06:02,748 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:06:02,757 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:06:02,814 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 03:06:02,872 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:06:07,264 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:06:07,274 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:06:07,345 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 03:06:07,420 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:06:11,883 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:06:11,894 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:06:11,966 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 03:06:12,046 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:06:16,733 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:06:16,743 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:06:16,814 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 03:06:16,888 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:06:21,487 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:06:21,495 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 03:06:21,552 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 03:06:21,610 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:06:26,285 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:06:26,295 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:06:26,366 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 03:06:26,440 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:06:31,185 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:06:31,196 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:06:31,267 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 03:06:31,342 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:06:35,561 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:06:35,569 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:06:35,641 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 03:06:35,716 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:06:40,508 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:06:40,517 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:06:40,574 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 03:06:40,574 INFO MainThread classification.py:158] Task: PoemSentimentClassification, split: test, subset: default. Running... 
+[2026-05-19 03:06:40,576 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 03:06:40,635 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:00,768 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:00,778 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:07:00,848 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 03:07:00,922 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:05,569 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:05,579 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:07:05,649 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 03:07:05,724 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:10,084 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:10,092 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:07:10,149 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 03:07:10,207 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:14,689 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:14,699 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:07:14,769 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 03:07:14,843 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:19,312 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:19,322 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 03:07:19,394 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 03:07:19,470 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:24,024 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:24,034 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:07:24,105 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 03:07:24,179 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:28,968 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:28,978 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:07:29,048 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 03:07:29,123 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:33,743 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:33,752 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:07:33,810 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 03:07:33,868 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:38,212 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:38,219 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:07:38,275 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 03:07:38,333 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:07:43,060 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:07:43,071 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 03:07:43,142 INFO MainThread classification.py:247] Running PoemSentimentClassification - Finished. +[2026-05-19 03:07:43,144 INFO MainThread evaluate.py:481] ✓ Finished evaluation for PoemSentimentClassification +[2026-05-19 03:07:43,145 INFO MainThread eval_mmteb_v2.py:322] [4/8] Evaluating: KorSarcasmClassification +[2026-05-19 03:07:43,145 WARNING MainThread abstask.py:105] Dataset 'KorSarcasmClassification' is superseded by 'KorSarcasmClassification.v2', you might consider using the newer version of the dataset. +[2026-05-19 03:07:45,191 INFO MainThread classification.py:158] Task: KorSarcasmClassification, split: train, subset: default. Running... +[2026-05-19 03:07:45,193 INFO MainThread classification.py:202] Running experiment (0/10) +[2026-05-19 03:07:45,768 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:03,680 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:03,686 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:15:04,996 INFO MainThread classification.py:202] Running experiment (1/10) +[2026-05-19 03:15:05,437 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:08,150 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:08,159 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:15:09,636 INFO MainThread classification.py:202] Running experiment (2/10) +[2026-05-19 03:15:10,211 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:12,590 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:12,593 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 03:15:13,705 INFO MainThread classification.py:202] Running experiment (3/10) +[2026-05-19 03:15:14,145 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:16,937 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:16,941 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:15:18,052 INFO MainThread classification.py:202] Running experiment (4/10) +[2026-05-19 03:15:18,493 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:21,580 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:21,585 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:15:22,683 INFO MainThread classification.py:202] Running experiment (5/10) +[2026-05-19 03:15:23,123 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:25,529 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:25,535 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:15:26,928 INFO MainThread classification.py:202] Running experiment (6/10) +[2026-05-19 03:15:27,371 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:29,700 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:29,706 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:15:31,135 INFO MainThread classification.py:202] Running experiment (7/10) +[2026-05-19 03:15:31,578 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:34,370 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:34,374 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... 
+[2026-05-19 03:15:35,518 INFO MainThread classification.py:202] Running experiment (8/10) +[2026-05-19 03:15:35,961 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:39,472 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:39,479 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:15:40,952 INFO MainThread classification.py:202] Running experiment (9/10) +[2026-05-19 03:15:41,394 INFO MainThread sklearn_evaluator.py:91] Running - Encoding samples... +[2026-05-19 03:15:44,273 INFO MainThread sklearn_evaluator.py:108] Running - Fitting classifier... +[2026-05-19 03:15:44,278 INFO MainThread sklearn_evaluator.py:112] Running - Evaluating classifier... +[2026-05-19 03:15:45,425 INFO MainThread classification.py:247] Running KorSarcasmClassification - Finished. +[2026-05-19 03:15:45,426 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorSarcasmClassification +[2026-05-19 03:15:45,427 INFO MainThread eval_mmteb_v2.py:322] [5/8] Evaluating: KorHateSpeechMLClassification +[2026-05-19 03:15:47,742 INFO MainThread classification.py:158] Task: KorHateSpeechMLClassification, split: test, subset: default. Running... +[2026-05-19 03:15:47,746 INFO MainThread multilabel_classification.py:95] Running multilabel classification task - Sampling training data... +[2026-05-19 03:15:56,992 INFO MainThread multilabel_classification.py:118] Running multilabel classification - Encoding training set... +[2026-05-19 03:17:40,141 WARNING MainThread multilabel_classification.py:138] Couldn't subsample, continuing with the entire test set. +[2026-05-19 03:17:40,571 INFO MainThread multilabel_classification.py:147] Running multilabel classification - Encoding test set... +[2026-05-19 03:24:15,789 INFO MainThread multilabel_classification.py:158] Running multilabel classification - Evaluating classifiers... 
+[2026-05-19 03:24:20,806 INFO MainThread multilabel_classification.py:187] Running multilabel classification - Finished. +[2026-05-19 03:24:20,808 INFO MainThread evaluate.py:481] ✓ Finished evaluation for KorHateSpeechMLClassification +[2026-05-19 03:24:20,809 INFO MainThread eval_mmteb_v2.py:322] [6/8] Evaluating: SICK-R +[2026-05-19 03:24:23,702 INFO MainThread abstask.py:176] Running task SICK-R (split='test', hf_subset='default')... +[2026-05-19 03:24:23,799 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 03:38:18,444 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 03:52:13,855 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 03:52:14,057 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 03:52:14,133 INFO MainThread evaluate.py:481] ✓ Finished evaluation for SICK-R +[2026-05-19 03:52:14,134 INFO MainThread eval_mmteb_v2.py:322] [7/8] Evaluating: STSBenchmark +[2026-05-19 03:52:17,707 INFO MainThread abstask.py:176] Running task STSBenchmark (split='test', hf_subset='default')... +[2026-05-19 03:52:17,728 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 03:54:32,599 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 03:56:46,148 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 03:56:46,168 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 03:56:46,184 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STSBenchmark +[2026-05-19 03:56:46,185 INFO MainThread eval_mmteb_v2.py:322] [8/8] Evaluating: STS17 +[2026-05-19 03:56:48,572 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ko-ko')... 
+[2026-05-19 03:56:48,611 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:03:47,373 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:10:48,647 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:10:48,680 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:10:48,701 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='ar-ar')... +[2026-05-19 04:10:48,705 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:11:15,232 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:11:41,864 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:11:41,869 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:11:41,893 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-ar')... +[2026-05-19 04:11:41,898 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:12:01,942 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:12:29,616 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:12:29,620 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:12:29,641 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-de')... 
+[2026-05-19 04:12:29,646 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:12:51,084 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:13:15,765 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:13:15,772 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:13:15,798 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-en')... +[2026-05-19 04:13:15,804 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:13:37,311 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:13:57,934 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:13:57,938 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:13:57,967 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='en-tr')... +[2026-05-19 04:13:57,972 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:14:18,876 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:14:46,256 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:14:46,260 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:14:46,283 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-en')... 
+[2026-05-19 04:14:46,287 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:15:13,682 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:15:34,885 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:15:34,889 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:15:34,914 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='es-es')... +[2026-05-19 04:15:34,918 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:16:01,326 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:16:28,174 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:16:28,178 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:16:28,210 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='fr-en')... +[2026-05-19 04:16:28,215 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:16:54,908 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:17:15,458 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:17:15,463 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:17:15,498 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='it-en')... 
+[2026-05-19 04:17:15,503 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:17:43,688 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:18:04,801 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:18:04,805 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:18:04,839 INFO MainThread abstask.py:176] Running task STS17 (split='test', hf_subset='nl-en')... +[2026-05-19 04:18:04,844 INFO MainThread any_sts_evaluator.py:65] Running semantic similarity - Encoding samples (1/2) +[2026-05-19 04:18:32,254 INFO MainThread any_sts_evaluator.py:80] Running semantic similarity - Encoding samples (2/2)... +[2026-05-19 04:18:53,425 INFO MainThread any_sts_evaluator.py:95] Running semantic similarity - Evaluating similarity... +[2026-05-19 04:18:53,428 INFO MainThread any_sts_evaluator.py:101] Running semantic similarity - Finished. +[2026-05-19 04:18:53,459 INFO MainThread evaluate.py:481] ✓ Finished evaluation for STS17 +[2026-05-19 04:18:53,461 INFO MainThread eval_mmteb_v2.py:504] Done! diff --git a/docs/bitnet-embeddings-qwen3-gguf-conversion.md b/docs/bitnet-embeddings-qwen3-gguf-conversion.md new file mode 100644 index 000000000..516142250 --- /dev/null +++ b/docs/bitnet-embeddings-qwen3-gguf-conversion.md @@ -0,0 +1,275 @@ +# BitNet Embeddings (Qwen3) GGUF Conversion Implementation + +## 1. Background + +`bitnet-embeddings-0.6b` is a Qwen3-based embedding model with BitNet per-projection RMSNorm (`BitLinear`). 
Each linear projection (q/k/v/o/gate/up/down) has a `.norm.weight` that applies RMSNorm to the input **before** the matmul: + +``` +x → RMSNorm(x, norm.weight) → activation_quant(8bit) → matmul(weight_quant(ternary)) +``` + +This pattern does **not** exist in any standard llama.cpp architecture: +- Standard Qwen3: no per-projection norms +- Standard BitNet: has `attn_sub_norm`/`ffn_sub_norm` at different positions (after attention/gate*up, not before each projection) + +### Model Config + +- Architecture: `Qwen3Model` +- hidden_size: 1024, num_attention_heads: 16, num_key_value_heads: 8 +- head_dim: 128 (note: != hidden_size/num_heads = 64) +- intermediate_size: 3072, num_hidden_layers: 28 +- tie_word_embeddings: true +- rope_theta: 1000000, rms_norm_eps: 1e-06 + +### Per-Layer Tensors (7 extra norm tensors per layer) + +| Tensor | Shape | +|--------|-------| +| `self_attn.q_proj.norm.weight` | [1024] | +| `self_attn.k_proj.norm.weight` | [1024] | +| `self_attn.v_proj.norm.weight` | [1024] | +| `self_attn.o_proj.norm.weight` | [2048] | +| `mlp.gate_proj.norm.weight` | [1024] | +| `mlp.up_proj.norm.weight` | [1024] | +| `mlp.down_proj.norm.weight` | [3072] | + +--- + +## 2. Implementation Plan + +### Step 1: Conversion Script +Create a standalone Python script to convert safetensors → GGUF with proper tensor name mapping for all 7 per-projection norms. + +### Step 2: C++ llama.cpp Modifications +Add support for the new tensor types in `llama.cpp`: enums, name mappings, struct fields, loading, and inference graph construction. + +### Step 3: Precision Verification +Verify tensor-level and inference-level precision alignment. + +--- + +## 3. 
GGUF Tensor Name Mapping + +| HF Name | GGUF Name | Notes | +|----------|-----------|-------| +| `embed_tokens.weight` | `token_embd.weight` | | +| `norm.weight` | `output_norm.weight` | | +| `layers.{i}.input_layernorm.weight` | `blk.{i}.attn_norm.weight` | | +| `layers.{i}.post_attention_layernorm.weight` | `blk.{i}.ffn_norm.weight` | | +| `layers.{i}.self_attn.q_proj.weight` | `blk.{i}.attn_q.weight` | | +| `layers.{i}.self_attn.k_proj.weight` | `blk.{i}.attn_k.weight` | | +| `layers.{i}.self_attn.v_proj.weight` | `blk.{i}.attn_v.weight` | | +| `layers.{i}.self_attn.o_proj.weight` | `blk.{i}.attn_output.weight` | | +| `layers.{i}.self_attn.q_norm.weight` | `blk.{i}.attn_q_norm.weight` | QK head norm | +| `layers.{i}.self_attn.k_norm.weight` | `blk.{i}.attn_k_norm.weight` | QK head norm | +| `layers.{i}.self_attn.q_proj.norm.weight` | `blk.{i}.attn_q_norm_in.weight` | **NEW** | +| `layers.{i}.self_attn.k_proj.norm.weight` | `blk.{i}.attn_k_norm_in.weight` | **NEW** | +| `layers.{i}.self_attn.v_proj.norm.weight` | `blk.{i}.attn_v_norm_in.weight` | **NEW** | +| `layers.{i}.self_attn.o_proj.norm.weight` | `blk.{i}.attn_output_norm_in.weight` | **NEW** | +| `layers.{i}.mlp.gate_proj.weight` | `blk.{i}.ffn_gate.weight` | | +| `layers.{i}.mlp.up_proj.weight` | `blk.{i}.ffn_up.weight` | | +| `layers.{i}.mlp.down_proj.weight` | `blk.{i}.ffn_down.weight` | | +| `layers.{i}.mlp.gate_proj.norm.weight` | `blk.{i}.ffn_gate_norm_in.weight` | **NEW** | +| `layers.{i}.mlp.up_proj.norm.weight` | `blk.{i}.ffn_up_norm_in.weight` | **NEW** | +| `layers.{i}.mlp.down_proj.norm.weight` | `blk.{i}.ffn_down_norm_in.weight` | **NEW** | + +--- + +## 4. New Files Created + +### `utils/convert-bitnet-embedding-to-gguf.py` + +Standalone conversion script. 
Key features: +- Hardcoded HF→GGUF tensor name mapping (no dependency on llama.cpp's Python converter) +- Supports `--outtype f16` (2D weights as f16, norms as f32) and `--outtype f32` +- Writes `key_length` and `value_length` metadata for head_dim=128 (critical: default calculation would give wrong value 64) +- GPT-2 BPE tokenizer handling for Qwen3 +- Architecture string: `"qwen3"` + +Usage: +```bash +python3 utils/convert-bitnet-embedding-to-gguf.py \ + /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --outfile output-f16.gguf --outtype f16 +``` + +### `scripts/verify_gguf_precision.py` + +Two-level precision verification: +- **Level 1**: Tensor-level comparison (safetensors vs GGUF, accounting for bf16→f16 conversion) +- **Level 2**: Inference-level comparison (PyTorch with BitLinear vs llama-embedding binary) + +Usage: +```bash +python3 scripts/verify_gguf_precision.py \ + --model-dir /data2/.../bitnet-embeddings-0.6b \ + --gguf-file output-f16.gguf --level both +``` + +### `scripts/verify_inference_precision.py` + +Per-token hidden state comparison with monkey-patched BitLinear (disabling activation/weight quantization for fair f16 comparison). + +--- + +## 5. 
C++ Modifications (`3rdparty/llama.cpp/src/llama.cpp`) + +### 5.1 New Tensor Enums + +Added 7 new entries after `LLM_TENSOR_FFN_SUB_NORM`: + +```cpp +LLM_TENSOR_ATTN_Q_NORM_IN, +LLM_TENSOR_ATTN_K_NORM_IN, +LLM_TENSOR_ATTN_V_NORM_IN, +LLM_TENSOR_ATTN_OUT_NORM_IN, +LLM_TENSOR_FFN_GATE_NORM_IN, +LLM_TENSOR_FFN_UP_NORM_IN, +LLM_TENSOR_FFN_DOWN_NORM_IN, +``` + +### 5.2 Tensor Name Mappings + +Added to `LLM_ARCH_QWEN3` tensor name map: + +```cpp +{ LLM_TENSOR_ATTN_Q_NORM_IN, "blk.%d.attn_q_norm_in" }, +{ LLM_TENSOR_ATTN_K_NORM_IN, "blk.%d.attn_k_norm_in" }, +{ LLM_TENSOR_ATTN_V_NORM_IN, "blk.%d.attn_v_norm_in" }, +{ LLM_TENSOR_ATTN_OUT_NORM_IN, "blk.%d.attn_output_norm_in" }, +{ LLM_TENSOR_FFN_GATE_NORM_IN, "blk.%d.ffn_gate_norm_in" }, +{ LLM_TENSOR_FFN_UP_NORM_IN, "blk.%d.ffn_up_norm_in" }, +{ LLM_TENSOR_FFN_DOWN_NORM_IN, "blk.%d.ffn_down_norm_in" }, +``` + +### 5.3 Layer Struct Fields + +Added to `struct llama_layer`: + +```cpp +struct ggml_tensor * attn_q_norm_in; +struct ggml_tensor * attn_k_norm_in; +struct ggml_tensor * attn_v_norm_in; +struct ggml_tensor * attn_out_norm_in; +struct ggml_tensor * ffn_gate_norm_in; +struct ggml_tensor * ffn_up_norm_in; +struct ggml_tensor * ffn_down_norm_in; +``` + +### 5.4 load_tensors (LLM_ARCH_QWEN3) + +Added optional loading with `TENSOR_NOT_REQUIRED`: + +```cpp +layer.attn_q_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_k_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_v_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_V_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.attn_out_norm_in = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM_IN, "weight", i), {n_embd_head_k * n_head}, TENSOR_NOT_REQUIRED); +layer.ffn_gate_norm_in = create_tensor(tn(LLM_TENSOR_FFN_GATE_NORM_IN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_up_norm_in = create_tensor(tn(LLM_TENSOR_FFN_UP_NORM_IN, "weight", i), 
{n_embd}, TENSOR_NOT_REQUIRED); +layer.ffn_down_norm_in = create_tensor(tn(LLM_TENSOR_FFN_DOWN_NORM_IN, "weight", i), {n_ff}, TENSOR_NOT_REQUIRED); +``` + +Note: `o_proj.norm` input dimension is `n_embd_head_k * n_head` (=2048), `down_proj.norm` input dimension is `n_ff` (=3072). + +### 5.5 build_qwen3() Graph Modifications + +The `build_qwen3()` function was modified to conditionally apply per-projection RMSNorm. The logic is fully backward compatible — when no `*_norm_in` tensors exist, behavior is identical to original. + +**Attention per-projection norms:** +``` +// Before Q/K/V matmul: +if (layer.attn_q_norm_in) { + cur_q = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur_q = ggml_mul(ctx, cur_q, layer.attn_q_norm_in); +} else { + cur_q = cur; +} +Qcur = ggml_mul_mat(ctx, layer.wq, cur_q); +// Similarly for K, V +``` + +**O_proj norm** requires special handling because `llm_build_kv()` normally applies `wo` internally. Solution: pass `wo=NULL` to `llm_build_kv()`, then apply norm + wo manually: + +``` +cur = llm_build_kv(..., wo=NULL, ...); // returns attention output without o_proj +if (layer.attn_out_norm_in) { + cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); + cur = ggml_mul(ctx, cur, layer.attn_out_norm_in); +} +cur = ggml_mul_mat(ctx, layer.wo, cur); +``` + +**FFN per-projection norms:** +``` +// Instead of llm_build_ffn(), manually: +if (layer.ffn_gate_norm_in) { + tmp_gate = rms_norm(cur) * gate_norm_in; +} else { + tmp_gate = cur; +} +tmp_gate = matmul(gate_proj, tmp_gate); +// Similarly for up_proj +tmp = silu(tmp_gate) * tmp_up; + +if (layer.ffn_down_norm_in) { + tmp = rms_norm(tmp) * down_norm_in; +} +cur = matmul(down_proj, tmp); +``` + +--- + +## 6. Key Issues Encountered and Solutions + +### Issue 1: Missing `output_norm.weight` +The model has `norm.weight` in safetensors but it wasn't being mapped. Added `"norm.weight": "output_norm.weight"` to the mapping. 
+ +### Issue 2: Wrong head_dim Calculation +`head_dim=128` but `hidden_size/num_attention_heads = 1024/16 = 64`. C++ defaulted to 64, causing a shape mismatch (`expected 1024,1024 got 1024,2048`, i.e. the Q projection: 16 heads × 128 = 2048 output features instead of the assumed 16 × 64 = 1024; K/V are mis-sized the same way, 8 × 128 = 1024 instead of 8 × 64 = 512). Fixed by writing `key_length` and `value_length` metadata in the GGUF. + +### Issue 3: llama-embedding Output Parsing +Initial approach failed to parse truncated output format. Fixed by using `--embd-output-format array` for clean JSON output. + +### Issue 4: PyTorch Model Ignoring Per-Projection Norms +`AutoModel.from_pretrained` uses standard Qwen3Model which doesn't know about `.norm.weight` tensors. Fixed by using `replace_linear_with_bitlinear()` from bitnet-embeddings repo and reloading weights. + +### Issue 5: Inference Precision Mismatch with BitLinear Active +PyTorch with `activation_quant`/`weight_quant` produces different results than llama.cpp f16 (expected). Fixed by monkey-patching `BitLinear.forward` to skip quantization for fair comparison. + +--- + +## 7. Verification Results + +### Level 1: Tensor Precision +- 506 tensors compared +- **Zero error** across all tensors (exact match after bf16→f16 conversion) + +### Level 2: Inference Precision (f16, no activation/weight quant) +- Per-token hidden state cosine similarity: **> 0.9999999** for all test cases +- Test texts: "hello world", "The quick brown fox...", "机器学习是人工智能的一个分支" + +--- + +## 8. Build and Run + +```bash +# 1. Convert to GGUF +python3 utils/convert-bitnet-embedding-to-gguf.py \ + /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --outfile bitnet-embeddings-0.6b-f16.gguf --outtype f16 + +# 2. Build llama.cpp +cd /home/huangxin/code_list/BitNet +cmake -B build -DLLAMA_NATIVE=OFF +cmake --build build --target llama-embedding -j$(nproc) + +# 3. 
Verify tensor precision +python3 scripts/verify_gguf_precision.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --gguf-file bitnet-embeddings-0.6b-f16.gguf --level 1 + +# 4. Run embedding inference +build/bin/llama-embedding -m bitnet-embeddings-0.6b-f16.gguf \ + -p "hello world" --embd-normalize 2 --embd-output-format array +``` diff --git a/docs/eval-mmteb-v2-guide.md b/docs/eval-mmteb-v2-guide.md new file mode 100644 index 000000000..7c6099f73 --- /dev/null +++ b/docs/eval-mmteb-v2-guide.md @@ -0,0 +1,211 @@ +# bitnet-embeddings-0.6b MMTEB v2 Evaluation Guide + +## Overview + +This document describes how to evaluate the `bitnet-embeddings-0.6b` model on the MTEB multilingual v2 benchmark across three model formats: + +| Format | File | Inference | Description | +|--------|------|-----------|-------------| +| safetensors | `model.safetensors` | PyTorch GPU | Baseline (BitLinear quantized weights) | +| F16 GGUF | `bitnet-embeddings-0.6b-f16.gguf` | llama-embedding CPU | Unquantized half-precision (FP16) GGUF | +| I2_S GGUF | `bitnet-embeddings-0.6b-f16-i2_s.gguf` | llama-embedding CPU | 2-bit ternary packed GGUF | + +The evaluation script selects **33 tasks** (~1/4 of the full 131-task benchmark) covering all 9 task types, chosen for fast execution while maintaining representative coverage. + +## Prerequisites + +### 1. Build BitNet + +```bash +cd /home/huangxin/code_list/BitNet + +# Clean build +rm -rf build +cmake -B build -DBITNET_X86_TL2=OFF \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ +cmake --build build --config Release -j$(nproc) + +# Verify llama-embedding binary exists +ls build/bin/llama-embedding +``` + +### 2. 
Model Files + +Ensure the following files exist under the model directory: + +``` +/data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/ +├── model.safetensors # Original PyTorch weights +├── bitnet-embeddings-0.6b-f16.gguf # F16 GGUF (converted) +├── bitnet-embeddings-0.6b-f16-i2_s.gguf # I2_S GGUF (converted) +├── config.json +├── bitnet_config.json +├── tokenizer.json +├── tokenizer_config.json +└── ... +``` + +If GGUF files don't exist yet, convert them: + +```bash +# F16 GGUF +python utils/convert-bitnet-embedding-to-gguf.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b + +# I2_S GGUF +python utils/convert-bitnet-embedding-to-gguf.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --quantize i2_s +``` + +### 3. Python Dependencies + +```bash +pip install mteb transformers safetensors torch numpy +``` + +The safetensors model evaluation also depends on the `bitnet-embeddings-v260420` project: + +``` +/home/huangxin/code_list/bitnet-embeddings-v260420/src/ +├── search/simple_encoder.py # SimpleEncoder for PyTorch inference +├── inference/eval_instructions.py # Task-specific instruction prompts +├── data_utils.py +├── utils.py +└── logger_config.py +``` + +## Running Evaluations + +### Evaluate All 3 Models (Full Run) + +```bash +cd /home/huangxin/code_list/BitNet + +python scripts/eval_mmteb_v2.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --build-dir /home/huangxin/code_list/BitNet/build \ + --output-dir /home/huangxin/code_list/BitNet/eval_results \ + --model-type all \ + --threads 8 \ + --overwrite +``` + +### Evaluate a Single Model + +```bash +# safetensors (PyTorch GPU) +python scripts/eval_mmteb_v2.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --model-type safetensors + +# F16 
GGUF (CPU) +python scripts/eval_mmteb_v2.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --model-type f16 \ + --threads 8 + +# I2_S GGUF (CPU) +python scripts/eval_mmteb_v2.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --model-type i2s \ + --threads 8 +``` + +### Quick Test (Dry Run, 1 Task Only) + +```bash +python scripts/eval_mmteb_v2.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --model-type all \ + --dry-run +``` + +### Evaluate Specific Tasks + +```bash +python scripts/eval_mmteb_v2.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --model-type all \ + --tasks "STSBenchmark,ArguAna,DBpediaClassification" +``` + +### Background Execution with Log + +```bash +python scripts/eval_mmteb_v2.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --model-type all \ + --overwrite \ + 2>&1 | tee /home/huangxin/code_list/BitNet/eval_results/eval_log.txt & +``` + +## Monitoring Progress + +```bash +# Check if the process is running +ps aux | grep eval_mmteb + +# View live log output +tail -f /home/huangxin/code_list/BitNet/eval_results/eval_log.txt + +# Count completed evaluations +grep "Finished evaluation" /home/huangxin/code_list/BitNet/eval_results/eval_log.txt +``` + +## Command-Line Arguments + +| Argument | Default | Description | +|----------|---------|-------------| +| `--model-dir` | (required) | Path to model directory with safetensors and GGUF files | +| `--build-dir` | `BitNet/build` | Path to BitNet build directory containing `bin/llama-embedding` | +| `--output-dir` | `BitNet/eval_results` | Output directory for MTEB results | +| `--model-type` | `all` | Model to evaluate: `all`, `safetensors`, `f16`, `i2s` | +| `--threads` | `8` | Number of CPU 
threads for llama-embedding | +| `--tasks` | (33 selected) | Comma-separated task names to evaluate | +| `--dry-run` | `false` | Run only 1 task for quick validation | +| `--overwrite` | `false` | Overwrite existing evaluation results | + +## Selected Tasks (33 / 131) + +| Task Type | Count | Tasks | +|-----------|-------|-------| +| BitextMining | 3 | BornholmBitextMining, NusaXBitextMining, Tatoeba | +| Classification | 11 | DBpediaClassification, FinancialPhrasebankClassification, PoemSentimentClassification, TweetTopicSingleClassification, AmazonCounterfactualClassification, MassiveIntentClassification, NordicLangClassification, KorSarcasmClassification, SinhalaNewsClassification, DalajClassification, IndicLangClassification | +| Clustering | 4 | WikiCitiesClustering, SIB200ClusteringS2S, ArXivHierarchicalClusteringS2S, StackExchangeClustering.v2 | +| Retrieval | 4 | ArguAna, SCIDOCS, HagridRetrieval, StackOverflowQA | +| InstructionReranking | 1 | Core17InstructionRetrieval | +| MultilabelClassification | 1 | KorHateSpeechMLClassification | +| PairClassification | 3 | SprintDuplicateQuestions, TwitterURLCorpus, XNLI | +| Reranking | 2 | AlloprofReranking, T2Reranking | +| STS | 4 | STSBenchmark, STS17, SICK-R, GermanSTSBenchmark | + +## Expected Runtime + +| Model Type | Hardware | Estimated Time (33 tasks) | +|------------|----------|---------------------------| +| safetensors | 8x GPU | ~20-30 minutes | +| F16 GGUF | 8-thread CPU | ~1-2 hours | +| I2_S GGUF | 8-thread CPU | ~1-2 hours | +| All 3 models | GPU + CPU | ~3-5 hours total | + +## Output + +When evaluating multiple models, the script prints a comparison table at the end: + +``` +========================================================================== + MTEB v2 Evaluation Results Comparison +========================================================================== +Task safetensors f16 i2s +-------------------------------------------------------------------------- +BornholmBitextMining 0.8234 
0.8230 0.8228 +STSBenchmark 0.8567 0.8565 0.8560 +... +-------------------------------------------------------------------------- +AVERAGE 0.XXXX 0.XXXX 0.XXXX +========================================================================== +``` + +Per-model detailed results are saved under `--output-dir/{model_type}/`. diff --git a/eval_results/eval_log.txt b/eval_results/eval_log.txt new file mode 100644 index 000000000..888843389 --- /dev/null +++ b/eval_results/eval_log.txt @@ -0,0 +1,1202 @@ +[2026-05-19 00:47:59,348 INFO MainThread eval_mmteb_v2.py:443] Will evaluate 8 tasks: ['BornholmBitextMining', 'FinancialPhrasebankClassification', 'PoemSentimentClassification', 'KorSarcasmClassification', 'KorHateSpeechMLClassification', 'SICK-R', 'STSBenchmark', 'STS17'] +[2026-05-19 00:47:59,349 INFO MainThread eval_mmteb_v2.py:455] +============================================================ +[2026-05-19 00:47:59,349 INFO MainThread eval_mmteb_v2.py:456] Evaluating model type: safetensors +[2026-05-19 00:47:59,349 INFO MainThread eval_mmteb_v2.py:457] ============================================================ +[2026-05-19 00:47:59,454 INFO MainThread simple_encoder.py:65] Use varlen batching: False +Some weights of the model checkpoint at /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b were not used when initializing Qwen3Model: ['layers.0.mlp.down_proj.norm.weight', 'layers.0.mlp.gate_proj.norm.weight', 'layers.0.mlp.up_proj.norm.weight', 'layers.0.self_attn.k_proj.norm.weight', 'layers.0.self_attn.o_proj.norm.weight', 'layers.0.self_attn.q_proj.norm.weight', 'layers.0.self_attn.v_proj.norm.weight', 'layers.1.mlp.down_proj.norm.weight', 'layers.1.mlp.gate_proj.norm.weight', 'layers.1.mlp.up_proj.norm.weight', 'layers.1.self_attn.k_proj.norm.weight', 'layers.1.self_attn.o_proj.norm.weight', 'layers.1.self_attn.q_proj.norm.weight', 'layers.1.self_attn.v_proj.norm.weight', 'layers.10.mlp.down_proj.norm.weight', 
'layers.10.mlp.gate_proj.norm.weight', 'layers.10.mlp.up_proj.norm.weight', 'layers.10.self_attn.k_proj.norm.weight', 'layers.10.self_attn.o_proj.norm.weight', 'layers.10.self_attn.q_proj.norm.weight', 'layers.10.self_attn.v_proj.norm.weight', 'layers.11.mlp.down_proj.norm.weight', 'layers.11.mlp.gate_proj.norm.weight', 'layers.11.mlp.up_proj.norm.weight', 'layers.11.self_attn.k_proj.norm.weight', 'layers.11.self_attn.o_proj.norm.weight', 'layers.11.self_attn.q_proj.norm.weight', 'layers.11.self_attn.v_proj.norm.weight', 'layers.12.mlp.down_proj.norm.weight', 'layers.12.mlp.gate_proj.norm.weight', 'layers.12.mlp.up_proj.norm.weight', 'layers.12.self_attn.k_proj.norm.weight', 'layers.12.self_attn.o_proj.norm.weight', 'layers.12.self_attn.q_proj.norm.weight', 'layers.12.self_attn.v_proj.norm.weight', 'layers.13.mlp.down_proj.norm.weight', 'layers.13.mlp.gate_proj.norm.weight', 'layers.13.mlp.up_proj.norm.weight', 'layers.13.self_attn.k_proj.norm.weight', 'layers.13.self_attn.o_proj.norm.weight', 'layers.13.self_attn.q_proj.norm.weight', 'layers.13.self_attn.v_proj.norm.weight', 'layers.14.mlp.down_proj.norm.weight', 'layers.14.mlp.gate_proj.norm.weight', 'layers.14.mlp.up_proj.norm.weight', 'layers.14.self_attn.k_proj.norm.weight', 'layers.14.self_attn.o_proj.norm.weight', 'layers.14.self_attn.q_proj.norm.weight', 'layers.14.self_attn.v_proj.norm.weight', 'layers.15.mlp.down_proj.norm.weight', 'layers.15.mlp.gate_proj.norm.weight', 'layers.15.mlp.up_proj.norm.weight', 'layers.15.self_attn.k_proj.norm.weight', 'layers.15.self_attn.o_proj.norm.weight', 'layers.15.self_attn.q_proj.norm.weight', 'layers.15.self_attn.v_proj.norm.weight', 'layers.16.mlp.down_proj.norm.weight', 'layers.16.mlp.gate_proj.norm.weight', 'layers.16.mlp.up_proj.norm.weight', 'layers.16.self_attn.k_proj.norm.weight', 'layers.16.self_attn.o_proj.norm.weight', 'layers.16.self_attn.q_proj.norm.weight', 'layers.16.self_attn.v_proj.norm.weight', 'layers.17.mlp.down_proj.norm.weight', 
'layers.17.mlp.gate_proj.norm.weight', 'layers.17.mlp.up_proj.norm.weight', 'layers.17.self_attn.k_proj.norm.weight', 'layers.17.self_attn.o_proj.norm.weight', 'layers.17.self_attn.q_proj.norm.weight', 'layers.17.self_attn.v_proj.norm.weight', 'layers.18.mlp.down_proj.norm.weight', 'layers.18.mlp.gate_proj.norm.weight', 'layers.18.mlp.up_proj.norm.weight', 'layers.18.self_attn.k_proj.norm.weight', 'layers.18.self_attn.o_proj.norm.weight', 'layers.18.self_attn.q_proj.norm.weight', 'layers.18.self_attn.v_proj.norm.weight', 'layers.19.mlp.down_proj.norm.weight', 'layers.19.mlp.gate_proj.norm.weight', 'layers.19.mlp.up_proj.norm.weight', 'layers.19.self_attn.k_proj.norm.weight', 'layers.19.self_attn.o_proj.norm.weight', 'layers.19.self_attn.q_proj.norm.weight', 'layers.19.self_attn.v_proj.norm.weight', 'layers.2.mlp.down_proj.norm.weight', 'layers.2.mlp.gate_proj.norm.weight', 'layers.2.mlp.up_proj.norm.weight', 'layers.2.self_attn.k_proj.norm.weight', 'layers.2.self_attn.o_proj.norm.weight', 'layers.2.self_attn.q_proj.norm.weight', 'layers.2.self_attn.v_proj.norm.weight', 'layers.20.mlp.down_proj.norm.weight', 'layers.20.mlp.gate_proj.norm.weight', 'layers.20.mlp.up_proj.norm.weight', 'layers.20.self_attn.k_proj.norm.weight', 'layers.20.self_attn.o_proj.norm.weight', 'layers.20.self_attn.q_proj.norm.weight', 'layers.20.self_attn.v_proj.norm.weight', 'layers.21.mlp.down_proj.norm.weight', 'layers.21.mlp.gate_proj.norm.weight', 'layers.21.mlp.up_proj.norm.weight', 'layers.21.self_attn.k_proj.norm.weight', 'layers.21.self_attn.o_proj.norm.weight', 'layers.21.self_attn.q_proj.norm.weight', 'layers.21.self_attn.v_proj.norm.weight', 'layers.22.mlp.down_proj.norm.weight', 'layers.22.mlp.gate_proj.norm.weight', 'layers.22.mlp.up_proj.norm.weight', 'layers.22.self_attn.k_proj.norm.weight', 'layers.22.self_attn.o_proj.norm.weight', 'layers.22.self_attn.q_proj.norm.weight', 'layers.22.self_attn.v_proj.norm.weight', 'layers.23.mlp.down_proj.norm.weight', 
'layers.23.mlp.gate_proj.norm.weight', 'layers.23.mlp.up_proj.norm.weight', 'layers.23.self_attn.k_proj.norm.weight', 'layers.23.self_attn.o_proj.norm.weight', 'layers.23.self_attn.q_proj.norm.weight', 'layers.23.self_attn.v_proj.norm.weight', 'layers.24.mlp.down_proj.norm.weight', 'layers.24.mlp.gate_proj.norm.weight', 'layers.24.mlp.up_proj.norm.weight', 'layers.24.self_attn.k_proj.norm.weight', 'layers.24.self_attn.o_proj.norm.weight', 'layers.24.self_attn.q_proj.norm.weight', 'layers.24.self_attn.v_proj.norm.weight', 'layers.25.mlp.down_proj.norm.weight', 'layers.25.mlp.gate_proj.norm.weight', 'layers.25.mlp.up_proj.norm.weight', 'layers.25.self_attn.k_proj.norm.weight', 'layers.25.self_attn.o_proj.norm.weight', 'layers.25.self_attn.q_proj.norm.weight', 'layers.25.self_attn.v_proj.norm.weight', 'layers.26.mlp.down_proj.norm.weight', 'layers.26.mlp.gate_proj.norm.weight', 'layers.26.mlp.up_proj.norm.weight', 'layers.26.self_attn.k_proj.norm.weight', 'layers.26.self_attn.o_proj.norm.weight', 'layers.26.self_attn.q_proj.norm.weight', 'layers.26.self_attn.v_proj.norm.weight', 'layers.27.mlp.down_proj.norm.weight', 'layers.27.mlp.gate_proj.norm.weight', 'layers.27.mlp.up_proj.norm.weight', 'layers.27.self_attn.k_proj.norm.weight', 'layers.27.self_attn.o_proj.norm.weight', 'layers.27.self_attn.q_proj.norm.weight', 'layers.27.self_attn.v_proj.norm.weight', 'layers.3.mlp.down_proj.norm.weight', 'layers.3.mlp.gate_proj.norm.weight', 'layers.3.mlp.up_proj.norm.weight', 'layers.3.self_attn.k_proj.norm.weight', 'layers.3.self_attn.o_proj.norm.weight', 'layers.3.self_attn.q_proj.norm.weight', 'layers.3.self_attn.v_proj.norm.weight', 'layers.4.mlp.down_proj.norm.weight', 'layers.4.mlp.gate_proj.norm.weight', 'layers.4.mlp.up_proj.norm.weight', 'layers.4.self_attn.k_proj.norm.weight', 'layers.4.self_attn.o_proj.norm.weight', 'layers.4.self_attn.q_proj.norm.weight', 'layers.4.self_attn.v_proj.norm.weight', 'layers.5.mlp.down_proj.norm.weight', 
'layers.5.mlp.gate_proj.norm.weight', 'layers.5.mlp.up_proj.norm.weight', 'layers.5.self_attn.k_proj.norm.weight', 'layers.5.self_attn.o_proj.norm.weight', 'layers.5.self_attn.q_proj.norm.weight', 'layers.5.self_attn.v_proj.norm.weight', 'layers.6.mlp.down_proj.norm.weight', 'layers.6.mlp.gate_proj.norm.weight', 'layers.6.mlp.up_proj.norm.weight', 'layers.6.self_attn.k_proj.norm.weight', 'layers.6.self_attn.o_proj.norm.weight', 'layers.6.self_attn.q_proj.norm.weight', 'layers.6.self_attn.v_proj.norm.weight', 'layers.7.mlp.down_proj.norm.weight', 'layers.7.mlp.gate_proj.norm.weight', 'layers.7.mlp.up_proj.norm.weight', 'layers.7.self_attn.k_proj.norm.weight', 'layers.7.self_attn.o_proj.norm.weight', 'layers.7.self_attn.q_proj.norm.weight', 'layers.7.self_attn.v_proj.norm.weight', 'layers.8.mlp.down_proj.norm.weight', 'layers.8.mlp.gate_proj.norm.weight', 'layers.8.mlp.up_proj.norm.weight', 'layers.8.self_attn.k_proj.norm.weight', 'layers.8.self_attn.o_proj.norm.weight', 'layers.8.self_attn.q_proj.norm.weight', 'layers.8.self_attn.v_proj.norm.weight', 'layers.9.mlp.down_proj.norm.weight', 'layers.9.mlp.gate_proj.norm.weight', 'layers.9.mlp.up_proj.norm.weight', 'layers.9.self_attn.k_proj.norm.weight', 'layers.9.self_attn.o_proj.norm.weight', 'layers.9.self_attn.q_proj.norm.weight', 'layers.9.self_attn.v_proj.norm.weight'] +- This IS expected if you are initializing Qwen3Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). +- This IS NOT expected if you are initializing Qwen3Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). 
+[2026-05-19 00:47:59,527 INFO MainThread bitnet.py:280] Loaded BitNet config from /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet_config.json +[2026-05-19 00:48:02,611 INFO MainThread bitnet.py:243] Replaced 196 nn.Linear layers with BitLinear (method=minmax, standard=False) +The tokenizer you are loading from '/data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +[2026-05-19 00:48:03,700 INFO MainThread simple_encoder.py:95] Loaded model /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b with pool_type=last, l2_normalize=True, dtype=torch.bfloat16, max_length=512, gpu_count=2 +[2026-05-19 00:48:03,700 INFO MainThread eval_mmteb_v2.py:318] Loaded model: microsoft/bitnet-embeddings-0.6b-safetensors +[2026-05-19 00:48:03,700 INFO MainThread eval_mmteb_v2.py:322] [1/8] Evaluating: BornholmBitextMining +[2026-05-19 00:48:06,929 INFO MainThread bitext_mining.py:104] Task: BornholmBitextMining, split: test, subset: default. Running... 
+ 0%| | 0/2 [00:00 +inline int32_t three_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) { +#if defined __AVX2__ + __m256 vec_lut[16]; + const __m256i vec_bi = _mm256_set_epi32(84, 72, 60, 48, 36, 24, 12, 0); + float scales = *lut_scales; + __m256i shuffle_mask = _mm256_set_epi8( + 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, + 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00, + 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, + 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00 + ); +#pragma unroll + for (int k = 0; k < act_k / 24; ++k) { + __m256 vec_b0 = _mm256_i32gather_ps(b + k * 24 + 0, vec_bi, 1); + __m256 vec_b1 = _mm256_i32gather_ps(b + k * 24 + 1, vec_bi, 1); + __m256 vec_b2 = _mm256_i32gather_ps(b + k * 24 + 2, vec_bi, 1); + + __m256i vec_b0i = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(vec_b0, _mm256_set1_ps(scales)), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m256i vec_b1i = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(vec_b1, _mm256_set1_ps(scales)), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m256i vec_b2i = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(vec_b2, _mm256_set1_ps(scales)), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + + vec_lut[15] = _mm256_setzero_si256(); + vec_lut[14] = _mm256_setzero_si256(); + vec_lut[13] = vec_b0i; + vec_lut[13] = _mm256_add_epi32(vec_lut[13], vec_b1i); + vec_lut[13] = _mm256_add_epi32(vec_lut[13], vec_b2i); + vec_lut[12] = vec_b0i; + vec_lut[12] = _mm256_add_epi32(vec_lut[12], vec_b1i); + vec_lut[11] = vec_b0i; + vec_lut[11] = _mm256_add_epi32(vec_lut[11], vec_b1i); + vec_lut[11] = _mm256_sub_epi32(vec_lut[11], vec_b2i); + vec_lut[10] = vec_b0i; + vec_lut[10] = _mm256_add_epi32(vec_lut[10], vec_b2i); + vec_lut[9] = vec_b0i; + vec_lut[8] = vec_b0i; + vec_lut[8] = _mm256_sub_epi32(vec_lut[8], vec_b2i); + vec_lut[7] = vec_b0i; + vec_lut[7] = _mm256_sub_epi32(vec_lut[7], vec_b1i); + vec_lut[7] = _mm256_add_epi32(vec_lut[7], vec_b2i); + vec_lut[6] = vec_b0i; 
+ vec_lut[6] = _mm256_sub_epi32(vec_lut[6], vec_b1i); + vec_lut[5] = vec_b0i; + vec_lut[5] = _mm256_sub_epi32(vec_lut[5], vec_b1i); + vec_lut[5] = _mm256_sub_epi32(vec_lut[5], vec_b2i); + vec_lut[4] = vec_b1i; + vec_lut[4] = _mm256_add_epi32(vec_lut[4], vec_b2i); + vec_lut[3] = vec_b1i; + vec_lut[2] = vec_b1i; + vec_lut[2] = _mm256_sub_epi32(vec_lut[2], vec_b2i); + vec_lut[1] = vec_b2i; + vec_lut[0] = _mm256_setzero_si256(); + __m256i ix[16]; + +#pragma unroll + for (int g = 0; g < 16; ++g) { + ix[g] = vec_lut[g]; + } + + Transpose_8_8(&(ix[0]), &(ix[1]), &(ix[2]), &(ix[3]), &(ix[4]), &(ix[5]),&(ix[6]), &(ix[7])); + Transpose_8_8(&(ix[8]), &(ix[9]), &(ix[10]), &(ix[11]), &(ix[12]), &(ix[13]),&(ix[14]), &(ix[15])); + +#pragma unroll + for (int g = 0; g < 8; ++g) { + ix[g] = _mm256_packs_epi32(ix[g], ix[g + 8]); + ix[g] = _mm256_permute4x64_epi64(ix[g], _MM_SHUFFLE(3, 1, 2, 0)); + ix[g] = _mm256_shuffle_epi8(ix[g], shuffle_mask); + ix[g] = _mm256_permute4x64_epi64(ix[g], _MM_SHUFFLE(3, 1, 2, 0)); + } + int8_t* qlut_i8 = reinterpret_cast(qlut); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 0 * 32 + 0), ix[0]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 1 * 32 + 0), ix[1]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 2 * 32 + 0), ix[2]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 3 * 32 + 0), ix[3]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 4 * 32 + 0), ix[4]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 5 * 32 + 0), ix[5]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 6 * 32 + 0), ix[6]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 7 * 32 + 0), ix[7]); + + } + + *lut_scales = scales; +#endif + return 0; +} + +template +inline int32_t two_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) { +#if defined __AVX2__ + __m256 vec_lut[16]; + const 
__m256i vec_bi = _mm256_set_epi32(56, 48, 40, 32, 24, 16, 8, 0); + float scales = *lut_scales; + __m256i shuffle_mask = _mm256_set_epi8( + 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, + 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00, + 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, + 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00 + ); +#pragma unroll + for (int k = 0; k < act_k / 16; ++k) { + __m256 vec_b0f = _mm256_i32gather_ps(b + k * 16 + 0, vec_bi, 1); + __m256 vec_b1f = _mm256_i32gather_ps(b + k * 16 + 1, vec_bi, 1); + + __m256i vec_b0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(vec_b0f, _mm256_set1_ps(scales)), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m256i vec_b1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(vec_b1f, _mm256_set1_ps(scales)), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + vec_lut[15] = _mm256_setzero_si256(); + vec_lut[14] = _mm256_setzero_si256(); + vec_lut[13] = _mm256_setzero_si256(); + vec_lut[12] = _mm256_setzero_si256(); + vec_lut[11] = _mm256_setzero_si256(); + vec_lut[10] = _mm256_setzero_si256(); + vec_lut[9] = _mm256_setzero_si256(); + vec_lut[8] = vec_b0; + vec_lut[8] = _mm256_add_epi32(vec_lut[8], vec_b1); + vec_lut[7] = vec_b0; + vec_lut[6] = vec_b0; + vec_lut[6] = _mm256_sub_epi32(vec_lut[6], vec_b1); + vec_lut[5] = vec_b1; + vec_lut[4] = _mm256_setzero_si256(); + vec_lut[3] = _mm256_setzero_si256(); + vec_lut[3] = _mm256_sub_epi32(vec_lut[3], vec_b1); + vec_lut[2] = _mm256_setzero_si256(); + vec_lut[2] = _mm256_sub_epi32(vec_lut[2], vec_b0); + vec_lut[2] = _mm256_add_epi32(vec_lut[2], vec_b1); + vec_lut[1] = _mm256_setzero_si256(); + vec_lut[1] = _mm256_sub_epi32(vec_lut[1], vec_b0); + vec_lut[0] = _mm256_setzero_si256(); + vec_lut[0] = _mm256_sub_epi32(vec_lut[0], vec_b0); + vec_lut[0] = _mm256_sub_epi32(vec_lut[0], vec_b1); + + __m256i ix[16]; +#pragma unroll + for (int g = 0; g < 16; ++g) { + ix[g] = vec_lut[g]; + } + + Transpose_8_8(&(ix[0]), &(ix[1]), &(ix[2]), &(ix[3]), &(ix[4]), 
&(ix[5]),&(ix[6]), &(ix[7])); + Transpose_8_8(&(ix[8]), &(ix[9]), &(ix[10]), &(ix[11]), &(ix[12]), &(ix[13]),&(ix[14]), &(ix[15])); + +#pragma unroll + for (int g = 0; g < 8; ++g) { + ix[g] = _mm256_packs_epi32(ix[g], ix[g + 8]); + ix[g] = _mm256_permute4x64_epi64(ix[g], _MM_SHUFFLE(3, 1, 2, 0)); + ix[g] = _mm256_shuffle_epi8(ix[g], shuffle_mask); + ix[g] = _mm256_permute4x64_epi64(ix[g], _MM_SHUFFLE(3, 1, 2, 0)); + } + + int8_t* qlut_i8 = reinterpret_cast(qlut); + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 0 * 32 + 0), ix[0]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 1 * 32 + 0), ix[1]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 2 * 32 + 0), ix[2]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 3 * 32 + 0), ix[3]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 4 * 32 + 0), ix[4]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 5 * 32 + 0), ix[5]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 6 * 32 + 0), ix[6]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(qlut_i8 + k * 256 + 7 * 32 + 0), ix[7]); + + } + *lut_scales = scales; +#endif + return 0; +} +static bool is_type_supported(enum ggml_type type) { + if (type == GGML_TYPE_Q4_0 || + type == GGML_TYPE_TL2) { + return true; + } else { + return false; + } +} +#include + +#define BM3200_8640 160 +#define BBK3200_8640 96 +template +inline void three_tbl_impl_3200_8640(int32_t* c, int8_t* lut, uint8_t* a, uint8_t* sign) { +#ifdef __AVX2__ + const __m256i vec_mask = _mm256_set1_epi8(0x0f); + const __m256i vec_sign_mask = _mm256_set1_epi16(0x8000); + const __m256i vec_zero = _mm256_set1_epi8(0x00); + const __m256i vec_one = _mm256_set1_epi8(0xff); + const int KK = BBK3200_8640 / 3; +#pragma unroll + for (int i = 0; i < BM3200_8640; i += 32) { + __m256i vec_as[KK / 2]; + __m256i vec_signs[KK / 8]; + #pragma unroll + for (int ai = 0; ai < KK / 2; 
ai++) { + vec_as[ai] = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + ai * 32)); + } + #pragma unroll + for (int as = 0; as < KK / 8; as++) { + vec_signs[as] = _mm256_loadu_si256(reinterpret_cast<__m256i*>(sign + i * KK / 8 + as * 32)); + } +#pragma unroll + for (int bs = 0; bs < batch_size; bs++) { + __m256i vec_c0 = _mm256_setzero_si256(); + __m256i vec_c1 = _mm256_setzero_si256(); +#pragma unroll + for (int k = 0; k < KK / 8; k++) { + __m256i vec_sign = vec_signs[k]; + __m256i vec_a_0 = vec_as[k * 4 + 0]; + __m128i vec_k1_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0)), 15); + __m256i vec_sign_left_lo_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0 + 1)), 15); + __m256i vec_v_top_0 = _mm256_and_si256(_mm256_srli_epi16(vec_a_0, 4), vec_mask); + __m256i vec_v_top_fir_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_0, vec_k1_0), vec_v_top_0); + __m256i vec_v_top_sec_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_0, vec_k2_0), vec_v_top_0); + __m256i vec_sign_right_hi_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0 + 2)), 15); + __m256i vec_sign_right_lo_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0 + 3)), 15); + __m256i vec_v_bot_0 = _mm256_and_si256(vec_a_0, vec_mask); + __m256i vec_v_bot_fir_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_0, vec_k3_0), vec_v_bot_0); + __m256i vec_v_bot_sec_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_0, vec_k4_0), vec_v_bot_0); + __m256i vec_v_top_lo_0 = 
_mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_0, vec_v_top_sec_0), vec_sign_left_lo_0), vec_sign_left_lo_0); + __m256i vec_v_top_hi_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_0, vec_v_top_sec_0), vec_sign_left_hi_0), vec_sign_left_hi_0); + __m256i vec_v_bot_lo_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_0, vec_v_bot_sec_0), vec_sign_right_lo_0), vec_sign_right_lo_0); + __m256i vec_v_bot_hi_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_0, vec_v_bot_sec_0), vec_sign_right_hi_0), vec_sign_right_hi_0); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_0); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_0); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_0); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_0); + __m256i vec_a_1 = vec_as[k * 4 + 1]; + __m128i vec_k1_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1)), 15); + __m256i vec_sign_left_lo_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1 + 1)), 15); + __m256i vec_v_top_1 = _mm256_and_si256(_mm256_srli_epi16(vec_a_1, 4), vec_mask); + __m256i vec_v_top_fir_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_1, vec_k1_1), vec_v_top_1); + __m256i vec_v_top_sec_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_1, vec_k2_1), vec_v_top_1); + __m256i vec_sign_right_hi_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1 + 2)), 15); + __m256i vec_sign_right_lo_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1 + 3)), 15); + 
__m256i vec_v_bot_1 = _mm256_and_si256(vec_a_1, vec_mask); + __m256i vec_v_bot_fir_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_1, vec_k3_1), vec_v_bot_1); + __m256i vec_v_bot_sec_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_1, vec_k4_1), vec_v_bot_1); + __m256i vec_v_top_lo_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_1, vec_v_top_sec_1), vec_sign_left_lo_1), vec_sign_left_lo_1); + __m256i vec_v_top_hi_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_1, vec_v_top_sec_1), vec_sign_left_hi_1), vec_sign_left_hi_1); + __m256i vec_v_bot_lo_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_1, vec_v_bot_sec_1), vec_sign_right_lo_1), vec_sign_right_lo_1); + __m256i vec_v_bot_hi_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_1, vec_v_bot_sec_1), vec_sign_right_hi_1), vec_sign_right_hi_1); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_1); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_1); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_1); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_1); + __m256i vec_a_2 = vec_as[k * 4 + 2]; + __m128i vec_k1_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2)), 15); + __m256i vec_sign_left_lo_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2 + 1)), 15); + __m256i vec_v_top_2 = _mm256_and_si256(_mm256_srli_epi16(vec_a_2, 4), vec_mask); + __m256i vec_v_top_fir_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_2, vec_k1_2), vec_v_top_2); + __m256i 
vec_v_top_sec_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_2, vec_k2_2), vec_v_top_2); + __m256i vec_sign_right_hi_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2 + 2)), 15); + __m256i vec_sign_right_lo_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2 + 3)), 15); + __m256i vec_v_bot_2 = _mm256_and_si256(vec_a_2, vec_mask); + __m256i vec_v_bot_fir_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_2, vec_k3_2), vec_v_bot_2); + __m256i vec_v_bot_sec_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_2, vec_k4_2), vec_v_bot_2); + __m256i vec_v_top_lo_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_2, vec_v_top_sec_2), vec_sign_left_lo_2), vec_sign_left_lo_2); + __m256i vec_v_top_hi_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_2, vec_v_top_sec_2), vec_sign_left_hi_2), vec_sign_left_hi_2); + __m256i vec_v_bot_lo_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_2, vec_v_bot_sec_2), vec_sign_right_lo_2), vec_sign_right_lo_2); + __m256i vec_v_bot_hi_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_2, vec_v_bot_sec_2), vec_sign_right_hi_2), vec_sign_right_hi_2); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_2); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_2); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_2); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_2); + __m256i vec_a_3 = vec_as[k * 4 + 3]; + __m128i vec_k1_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3)), 15); + 
__m256i vec_sign_left_lo_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3 + 1)), 15); + __m256i vec_v_top_3 = _mm256_and_si256(_mm256_srli_epi16(vec_a_3, 4), vec_mask); + __m256i vec_v_top_fir_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_3, vec_k1_3), vec_v_top_3); + __m256i vec_v_top_sec_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_3, vec_k2_3), vec_v_top_3); + __m256i vec_sign_right_hi_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3 + 2)), 15); + __m256i vec_sign_right_lo_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3 + 3)), 15); + __m256i vec_v_bot_3 = _mm256_and_si256(vec_a_3, vec_mask); + __m256i vec_v_bot_fir_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_3, vec_k3_3), vec_v_bot_3); + __m256i vec_v_bot_sec_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_3, vec_k4_3), vec_v_bot_3); + __m256i vec_v_top_lo_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_3, vec_v_top_sec_3), vec_sign_left_lo_3), vec_sign_left_lo_3); + __m256i vec_v_top_hi_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_3, vec_v_top_sec_3), vec_sign_left_hi_3), vec_sign_left_hi_3); + __m256i vec_v_bot_lo_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_3, vec_v_bot_sec_3), vec_sign_right_lo_3), vec_sign_right_lo_3); + __m256i vec_v_bot_hi_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_3, vec_v_bot_sec_3), vec_sign_right_hi_3), vec_sign_right_hi_3); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_3); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_3); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_3); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_3); + } + __m256i vec_gc0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + BM3200_8640 * bs)); + __m256i vec_gc1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM3200_8640 * bs)); + __m256i vec_gc2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM3200_8640 * bs)); + __m256i vec_gc3 = 
_mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM3200_8640 * bs)); + vec_gc0 = _mm256_add_epi32(vec_gc0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c0))); + vec_gc1 = _mm256_add_epi32(vec_gc1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c0, 1))); + vec_gc2 = _mm256_add_epi32(vec_gc2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c1))); + vec_gc3 = _mm256_add_epi32(vec_gc3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c1, 1))); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + BM3200_8640 * bs), vec_gc0); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM3200_8640 * bs), vec_gc1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM3200_8640 * bs), vec_gc2); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM3200_8640 * bs), vec_gc3); + } + } +#endif +} + +template +inline int32_t two_tbl_impl3200_8640(int32_t* c, int8_t* lut, uint8_t* a) { +#ifdef __AVX2__ + const __m256i vec_mask = _mm256_set1_epi8(0x0f); + const int KK = BK2 / 2; +#pragma unroll + for (int i = 0; i < BM3200_8640; i += 32) { + __m256i vec_as[KK / 2]; + #pragma unroll + for (int ai = 0; ai < KK / 2; ai++) { + vec_as[ai] = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + ai * 32)); + } +#pragma unroll + for (int bs = 0; bs < batch_size; bs++) { + __m256i vec_c0 = _mm256_setzero_si256(); + __m256i vec_c1 = _mm256_setzero_si256(); +#pragma unroll + for (int k = 0; k < KK / 8; k++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + __m256i vec_a = vec_as[k * 4 + j]; + + __m128i vec_k1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 0 + K2 / 2 * 32 * bs)); + __m128i vec_k2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 16 + K2 / 2 * 32 * bs)); + __m128i vec_k3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 32 + K2 / 2 * 32 * bs)); + __m128i vec_k4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 48 + K2 / 2 * 
32 * bs)); + + __m256i vec_v_top = _mm256_and_si256(_mm256_srli_epi16(vec_a, 4), vec_mask); + __m256i vec_v_top_fir = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1, vec_k1), vec_v_top); + __m256i vec_v_top_sec = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2, vec_k2), vec_v_top); + + __m256i vec_v_bot = _mm256_and_si256(vec_a, vec_mask); + __m256i vec_v_bot_fir = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3, vec_k3), vec_v_bot); + __m256i vec_v_bot_sec = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4, vec_k4), vec_v_bot); + + __m256i vec_v_top_lo = _mm256_unpackhi_epi8(vec_v_top_fir, vec_v_top_sec); + __m256i vec_v_top_hi = _mm256_unpacklo_epi8(vec_v_top_fir, vec_v_top_sec); + __m256i vec_v_bot_lo = _mm256_unpackhi_epi8(vec_v_bot_fir, vec_v_bot_sec); + __m256i vec_v_bot_hi = _mm256_unpacklo_epi8(vec_v_bot_fir, vec_v_bot_sec); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo); + } + } + + __m256i vec_gc0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + BM3200_8640 * bs)); + __m256i vec_gc1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM3200_8640 * bs)); + __m256i vec_gc2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM3200_8640 * bs)); + __m256i vec_gc3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM3200_8640 * bs)); + + vec_gc0 = _mm256_add_epi32(vec_gc0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c0))); + vec_gc1 = _mm256_add_epi32(vec_gc1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c0, 1))); + vec_gc2 = _mm256_add_epi32(vec_gc2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c1))); + vec_gc3 = _mm256_add_epi32(vec_gc3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c1, 1))); + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + BM3200_8640 * bs), vec_gc0); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM3200_8640 * bs), vec_gc1); 
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM3200_8640 * bs), vec_gc2); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM3200_8640 * bs), vec_gc3); + } + } +#endif + return 0; +} + +template +int32_t three_qgemm_lut_3200_8640(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C) { + alignas(32) uint32_t CBits[BATCH_SIZE * BM3200_8640]; + memset(&(CBits[0]), 0, BATCH_SIZE * BM3200_8640 * sizeof(int32_t)); +#pragma unroll + for (int32_t k_outer = 0; k_outer < 8640 / BBK3200_8640; ++k_outer) { + three_tbl_impl_3200_8640((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BBK3200_8640 / 3 * 32)])), (&(((uint8_t*)A)[(k_outer * BBK3200_8640 / 3 / 2 * BM3200_8640)])), (&(((uint8_t*)sign)[(k_outer * BBK3200_8640 / 3 / 8 * BM3200_8640)]))); + } +#pragma unroll + for (int bs = 0; bs < BATCH_SIZE; bs++) { +#pragma unroll + for (int i = 0; i < BM3200_8640; i++) { + ((int32_t*)C)[i] = (int32_t)(((int32_t*)CBits)[i + bs * BM3200_8640]); + } + } + return 0; +} + +template +int32_t two_qgemm_lut_3200_8640(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) { + alignas(32) uint32_t CBits[BATCH_SIZE * BM3200_8640]; + memset(&(CBits[0]), 0, BATCH_SIZE * BM3200_8640 * sizeof(int32_t)); +#pragma unroll + for (int32_t k_outer = 0; k_outer < 0 / 32; ++k_outer) { + two_tbl_impl3200_8640((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BK2 / 2 * 32)])), (&(((uint8_t*)A)[(k_outer * BK2 / 2 / 2 * BM3200_8640)]))); + } +#pragma unroll + for (int bs = 0; bs < BATCH_SIZE; bs++) { +#pragma unroll + for (int i = 0; i < BM3200_8640; i++) { + ((int32_t*)C)[i] += (int32_t)(((int32_t*)CBits)[i + bs * BM3200_8640]); + ((float*)C)[i] = (float)(((int32_t*)C)[i]) / ((float*)LUT_Scales)[bs] * ((float*)Scales)[0]; + } + } + return 0; +} + +#include + +#define BM3200_3200 320 +#define BBK3200_3200 96 +template +inline void three_tbl_impl_3200_3200(int32_t* c, int8_t* lut, uint8_t* a, uint8_t* sign) { +#ifdef __AVX2__ + const __m256i 
vec_mask = _mm256_set1_epi8(0x0f); + const __m256i vec_sign_mask = _mm256_set1_epi16(0x8000); + const __m256i vec_zero = _mm256_set1_epi8(0x00); + const __m256i vec_one = _mm256_set1_epi8(0xff); + const int KK = BBK3200_3200 / 3; +#pragma unroll + for (int i = 0; i < BM3200_3200; i += 32) { + __m256i vec_as[KK / 2]; + __m256i vec_signs[KK / 8]; + #pragma unroll + for (int ai = 0; ai < KK / 2; ai++) { + vec_as[ai] = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + ai * 32)); + } + #pragma unroll + for (int as = 0; as < KK / 8; as++) { + vec_signs[as] = _mm256_loadu_si256(reinterpret_cast<__m256i*>(sign + i * KK / 8 + as * 32)); + } +#pragma unroll + for (int bs = 0; bs < batch_size; bs++) { + __m256i vec_c0 = _mm256_setzero_si256(); + __m256i vec_c1 = _mm256_setzero_si256(); +#pragma unroll + for (int k = 0; k < KK / 8; k++) { + __m256i vec_sign = vec_signs[k]; + __m256i vec_a_0 = vec_as[k * 4 + 0]; + __m128i vec_k1_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0)), 15); + __m256i vec_sign_left_lo_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0 + 1)), 15); + __m256i vec_v_top_0 = _mm256_and_si256(_mm256_srli_epi16(vec_a_0, 4), vec_mask); + __m256i vec_v_top_fir_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_0, vec_k1_0), vec_v_top_0); + __m256i vec_v_top_sec_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_0, vec_k2_0), vec_v_top_0); + __m256i vec_sign_right_hi_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0 + 2)), 15); + __m256i vec_sign_right_lo_0 = 
_mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0 + 3)), 15); + __m256i vec_v_bot_0 = _mm256_and_si256(vec_a_0, vec_mask); + __m256i vec_v_bot_fir_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_0, vec_k3_0), vec_v_bot_0); + __m256i vec_v_bot_sec_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_0, vec_k4_0), vec_v_bot_0); + __m256i vec_v_top_lo_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_0, vec_v_top_sec_0), vec_sign_left_lo_0), vec_sign_left_lo_0); + __m256i vec_v_top_hi_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_0, vec_v_top_sec_0), vec_sign_left_hi_0), vec_sign_left_hi_0); + __m256i vec_v_bot_lo_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_0, vec_v_bot_sec_0), vec_sign_right_lo_0), vec_sign_right_lo_0); + __m256i vec_v_bot_hi_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_0, vec_v_bot_sec_0), vec_sign_right_hi_0), vec_sign_right_hi_0); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_0); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_0); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_0); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_0); + __m256i vec_a_1 = vec_as[k * 4 + 1]; + __m128i vec_k1_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1)), 15); + __m256i vec_sign_left_lo_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1 + 1)), 15); + __m256i vec_v_top_1 = _mm256_and_si256(_mm256_srli_epi16(vec_a_1, 4), vec_mask); + __m256i vec_v_top_fir_1 = 
_mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_1, vec_k1_1), vec_v_top_1); + __m256i vec_v_top_sec_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_1, vec_k2_1), vec_v_top_1); + __m256i vec_sign_right_hi_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1 + 2)), 15); + __m256i vec_sign_right_lo_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1 + 3)), 15); + __m256i vec_v_bot_1 = _mm256_and_si256(vec_a_1, vec_mask); + __m256i vec_v_bot_fir_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_1, vec_k3_1), vec_v_bot_1); + __m256i vec_v_bot_sec_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_1, vec_k4_1), vec_v_bot_1); + __m256i vec_v_top_lo_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_1, vec_v_top_sec_1), vec_sign_left_lo_1), vec_sign_left_lo_1); + __m256i vec_v_top_hi_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_1, vec_v_top_sec_1), vec_sign_left_hi_1), vec_sign_left_hi_1); + __m256i vec_v_bot_lo_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_1, vec_v_bot_sec_1), vec_sign_right_lo_1), vec_sign_right_lo_1); + __m256i vec_v_bot_hi_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_1, vec_v_bot_sec_1), vec_sign_right_hi_1), vec_sign_right_hi_1); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_1); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_1); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_1); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_1); + __m256i vec_a_2 = vec_as[k * 4 + 2]; + __m128i vec_k1_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i 
vec_sign_left_hi_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2)), 15); + __m256i vec_sign_left_lo_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2 + 1)), 15); + __m256i vec_v_top_2 = _mm256_and_si256(_mm256_srli_epi16(vec_a_2, 4), vec_mask); + __m256i vec_v_top_fir_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_2, vec_k1_2), vec_v_top_2); + __m256i vec_v_top_sec_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_2, vec_k2_2), vec_v_top_2); + __m256i vec_sign_right_hi_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2 + 2)), 15); + __m256i vec_sign_right_lo_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2 + 3)), 15); + __m256i vec_v_bot_2 = _mm256_and_si256(vec_a_2, vec_mask); + __m256i vec_v_bot_fir_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_2, vec_k3_2), vec_v_bot_2); + __m256i vec_v_bot_sec_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_2, vec_k4_2), vec_v_bot_2); + __m256i vec_v_top_lo_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_2, vec_v_top_sec_2), vec_sign_left_lo_2), vec_sign_left_lo_2); + __m256i vec_v_top_hi_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_2, vec_v_top_sec_2), vec_sign_left_hi_2), vec_sign_left_hi_2); + __m256i vec_v_bot_lo_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_2, vec_v_bot_sec_2), vec_sign_right_lo_2), vec_sign_right_lo_2); + __m256i vec_v_bot_hi_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_2, vec_v_bot_sec_2), vec_sign_right_hi_2), vec_sign_right_hi_2); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_2); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_2); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_2); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_2); + __m256i vec_a_3 = vec_as[k * 4 + 3]; + __m128i vec_k1_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 
* 8 + 3 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3)), 15); + __m256i vec_sign_left_lo_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3 + 1)), 15); + __m256i vec_v_top_3 = _mm256_and_si256(_mm256_srli_epi16(vec_a_3, 4), vec_mask); + __m256i vec_v_top_fir_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_3, vec_k1_3), vec_v_top_3); + __m256i vec_v_top_sec_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_3, vec_k2_3), vec_v_top_3); + __m256i vec_sign_right_hi_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3 + 2)), 15); + __m256i vec_sign_right_lo_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3 + 3)), 15); + __m256i vec_v_bot_3 = _mm256_and_si256(vec_a_3, vec_mask); + __m256i vec_v_bot_fir_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_3, vec_k3_3), vec_v_bot_3); + __m256i vec_v_bot_sec_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_3, vec_k4_3), vec_v_bot_3); + __m256i vec_v_top_lo_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_3, vec_v_top_sec_3), vec_sign_left_lo_3), vec_sign_left_lo_3); + __m256i vec_v_top_hi_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_3, vec_v_top_sec_3), vec_sign_left_hi_3), vec_sign_left_hi_3); + __m256i vec_v_bot_lo_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_3, vec_v_bot_sec_3), vec_sign_right_lo_3), vec_sign_right_lo_3); + __m256i vec_v_bot_hi_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_3, vec_v_bot_sec_3), vec_sign_right_hi_3), vec_sign_right_hi_3); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_3); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_3); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_3); 
+ vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_3); + } + __m256i vec_gc0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + BM3200_3200 * bs)); + __m256i vec_gc1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM3200_3200 * bs)); + __m256i vec_gc2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM3200_3200 * bs)); + __m256i vec_gc3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM3200_3200 * bs)); + vec_gc0 = _mm256_add_epi32(vec_gc0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c0))); + vec_gc1 = _mm256_add_epi32(vec_gc1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c0, 1))); + vec_gc2 = _mm256_add_epi32(vec_gc2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c1))); + vec_gc3 = _mm256_add_epi32(vec_gc3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c1, 1))); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + BM3200_3200 * bs), vec_gc0); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM3200_3200 * bs), vec_gc1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM3200_3200 * bs), vec_gc2); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM3200_3200 * bs), vec_gc3); + } + } +#endif +} + +template +inline int32_t two_tbl_impl3200_3200(int32_t* c, int8_t* lut, uint8_t* a) { +#ifdef __AVX2__ + const __m256i vec_mask = _mm256_set1_epi8(0x0f); + const int KK = BK2 / 2; +#pragma unroll + for (int i = 0; i < BM3200_3200; i += 32) { + __m256i vec_as[KK / 2]; + #pragma unroll + for (int ai = 0; ai < KK / 2; ai++) { + vec_as[ai] = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + ai * 32)); + } +#pragma unroll + for (int bs = 0; bs < batch_size; bs++) { + __m256i vec_c0 = _mm256_setzero_si256(); + __m256i vec_c1 = _mm256_setzero_si256(); +#pragma unroll + for (int k = 0; k < KK / 8; k++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + __m256i vec_a = vec_as[k * 4 + j]; + + __m128i vec_k1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 
j * 64 + 0 + K2 / 2 * 32 * bs)); + __m128i vec_k2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 16 + K2 / 2 * 32 * bs)); + __m128i vec_k3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 32 + K2 / 2 * 32 * bs)); + __m128i vec_k4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 48 + K2 / 2 * 32 * bs)); + + __m256i vec_v_top = _mm256_and_si256(_mm256_srli_epi16(vec_a, 4), vec_mask); + __m256i vec_v_top_fir = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1, vec_k1), vec_v_top); + __m256i vec_v_top_sec = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2, vec_k2), vec_v_top); + + __m256i vec_v_bot = _mm256_and_si256(vec_a, vec_mask); + __m256i vec_v_bot_fir = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3, vec_k3), vec_v_bot); + __m256i vec_v_bot_sec = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4, vec_k4), vec_v_bot); + + __m256i vec_v_top_lo = _mm256_unpackhi_epi8(vec_v_top_fir, vec_v_top_sec); + __m256i vec_v_top_hi = _mm256_unpacklo_epi8(vec_v_top_fir, vec_v_top_sec); + __m256i vec_v_bot_lo = _mm256_unpackhi_epi8(vec_v_bot_fir, vec_v_bot_sec); + __m256i vec_v_bot_hi = _mm256_unpacklo_epi8(vec_v_bot_fir, vec_v_bot_sec); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo); + } + } + + __m256i vec_gc0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + BM3200_3200 * bs)); + __m256i vec_gc1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM3200_3200 * bs)); + __m256i vec_gc2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM3200_3200 * bs)); + __m256i vec_gc3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM3200_3200 * bs)); + + vec_gc0 = _mm256_add_epi32(vec_gc0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c0))); + vec_gc1 = _mm256_add_epi32(vec_gc1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c0, 1))); + 
vec_gc2 = _mm256_add_epi32(vec_gc2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c1))); + vec_gc3 = _mm256_add_epi32(vec_gc3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c1, 1))); + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + BM3200_3200 * bs), vec_gc0); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM3200_3200 * bs), vec_gc1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM3200_3200 * bs), vec_gc2); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM3200_3200 * bs), vec_gc3); + } + } +#endif + return 0; +} + +template +int32_t three_qgemm_lut_3200_3200(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C) { + alignas(32) uint32_t CBits[BATCH_SIZE * BM3200_3200]; + memset(&(CBits[0]), 0, BATCH_SIZE * BM3200_3200 * sizeof(int32_t)); +#pragma unroll + for (int32_t k_outer = 0; k_outer < 3168 / BBK3200_3200; ++k_outer) { + three_tbl_impl_3200_3200((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BBK3200_3200 / 3 * 32)])), (&(((uint8_t*)A)[(k_outer * BBK3200_3200 / 3 / 2 * BM3200_3200)])), (&(((uint8_t*)sign)[(k_outer * BBK3200_3200 / 3 / 8 * BM3200_3200)]))); + } +#pragma unroll + for (int bs = 0; bs < BATCH_SIZE; bs++) { +#pragma unroll + for (int i = 0; i < BM3200_3200; i++) { + ((int32_t*)C)[i] = (int32_t)(((int32_t*)CBits)[i + bs * BM3200_3200]); + } + } + return 0; +} + +template +int32_t two_qgemm_lut_3200_3200(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) { + alignas(32) uint32_t CBits[BATCH_SIZE * BM3200_3200]; + memset(&(CBits[0]), 0, BATCH_SIZE * BM3200_3200 * sizeof(int32_t)); +#pragma unroll + for (int32_t k_outer = 0; k_outer < 32 / 32; ++k_outer) { + two_tbl_impl3200_3200((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BK2 / 2 * 32)])), (&(((uint8_t*)A)[(k_outer * BK2 / 2 / 2 * BM3200_3200)]))); + } +#pragma unroll + for (int bs = 0; bs < BATCH_SIZE; bs++) { +#pragma unroll + for (int i = 0; i < BM3200_3200; i++) { + ((int32_t*)C)[i] += 
(int32_t)(((int32_t*)CBits)[i + bs * BM3200_3200]); + ((float*)C)[i] = (float)(((int32_t*)C)[i]) / ((float*)LUT_Scales)[bs] * ((float*)Scales)[0]; + } + } + return 0; +} + +#include + +#define BM8640_3200 320 +#define BBK8640_3200 96 +template +inline void three_tbl_impl_8640_3200(int32_t* c, int8_t* lut, uint8_t* a, uint8_t* sign) { +#ifdef __AVX2__ + const __m256i vec_mask = _mm256_set1_epi8(0x0f); + const __m256i vec_sign_mask = _mm256_set1_epi16(0x8000); + const __m256i vec_zero = _mm256_set1_epi8(0x00); + const __m256i vec_one = _mm256_set1_epi8(0xff); + const int KK = BBK8640_3200 / 3; +#pragma unroll + for (int i = 0; i < BM8640_3200; i += 32) { + __m256i vec_as[KK / 2]; + __m256i vec_signs[KK / 8]; + #pragma unroll + for (int ai = 0; ai < KK / 2; ai++) { + vec_as[ai] = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + ai * 32)); + } + #pragma unroll + for (int as = 0; as < KK / 8; as++) { + vec_signs[as] = _mm256_loadu_si256(reinterpret_cast<__m256i*>(sign + i * KK / 8 + as * 32)); + } +#pragma unroll + for (int bs = 0; bs < batch_size; bs++) { + __m256i vec_c0 = _mm256_setzero_si256(); + __m256i vec_c1 = _mm256_setzero_si256(); +#pragma unroll + for (int k = 0; k < KK / 8; k++) { + __m256i vec_sign = vec_signs[k]; + __m256i vec_a_0 = vec_as[k * 4 + 0]; + __m128i vec_k1_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 0 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0)), 15); + __m256i vec_sign_left_lo_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0 + 1)), 15); + __m256i vec_v_top_0 = 
_mm256_and_si256(_mm256_srli_epi16(vec_a_0, 4), vec_mask); + __m256i vec_v_top_fir_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_0, vec_k1_0), vec_v_top_0); + __m256i vec_v_top_sec_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_0, vec_k2_0), vec_v_top_0); + __m256i vec_sign_right_hi_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0 + 2)), 15); + __m256i vec_sign_right_lo_0 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 0 + 3)), 15); + __m256i vec_v_bot_0 = _mm256_and_si256(vec_a_0, vec_mask); + __m256i vec_v_bot_fir_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_0, vec_k3_0), vec_v_bot_0); + __m256i vec_v_bot_sec_0 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_0, vec_k4_0), vec_v_bot_0); + __m256i vec_v_top_lo_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_0, vec_v_top_sec_0), vec_sign_left_lo_0), vec_sign_left_lo_0); + __m256i vec_v_top_hi_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_0, vec_v_top_sec_0), vec_sign_left_hi_0), vec_sign_left_hi_0); + __m256i vec_v_bot_lo_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_0, vec_v_bot_sec_0), vec_sign_right_lo_0), vec_sign_right_lo_0); + __m256i vec_v_bot_hi_0 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_0, vec_v_bot_sec_0), vec_sign_right_hi_0), vec_sign_right_hi_0); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_0); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_0); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_0); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_0); + __m256i vec_a_1 = vec_as[k * 4 + 1]; + __m128i vec_k1_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_1 = 
_mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 1 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1)), 15); + __m256i vec_sign_left_lo_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1 + 1)), 15); + __m256i vec_v_top_1 = _mm256_and_si256(_mm256_srli_epi16(vec_a_1, 4), vec_mask); + __m256i vec_v_top_fir_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_1, vec_k1_1), vec_v_top_1); + __m256i vec_v_top_sec_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_1, vec_k2_1), vec_v_top_1); + __m256i vec_sign_right_hi_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1 + 2)), 15); + __m256i vec_sign_right_lo_1 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 1 + 3)), 15); + __m256i vec_v_bot_1 = _mm256_and_si256(vec_a_1, vec_mask); + __m256i vec_v_bot_fir_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_1, vec_k3_1), vec_v_bot_1); + __m256i vec_v_bot_sec_1 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_1, vec_k4_1), vec_v_bot_1); + __m256i vec_v_top_lo_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_1, vec_v_top_sec_1), vec_sign_left_lo_1), vec_sign_left_lo_1); + __m256i vec_v_top_hi_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_1, vec_v_top_sec_1), vec_sign_left_hi_1), vec_sign_left_hi_1); + __m256i vec_v_bot_lo_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_1, vec_v_bot_sec_1), vec_sign_right_lo_1), vec_sign_right_lo_1); + __m256i vec_v_bot_hi_1 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_1, vec_v_bot_sec_1), vec_sign_right_hi_1), vec_sign_right_hi_1); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_1); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_1); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_1); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_1); + __m256i vec_a_2 = vec_as[k * 4 + 2]; + __m128i vec_k1_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 
64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 2 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2)), 15); + __m256i vec_sign_left_lo_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2 + 1)), 15); + __m256i vec_v_top_2 = _mm256_and_si256(_mm256_srli_epi16(vec_a_2, 4), vec_mask); + __m256i vec_v_top_fir_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_2, vec_k1_2), vec_v_top_2); + __m256i vec_v_top_sec_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_2, vec_k2_2), vec_v_top_2); + __m256i vec_sign_right_hi_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2 + 2)), 15); + __m256i vec_sign_right_lo_2 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 2 + 3)), 15); + __m256i vec_v_bot_2 = _mm256_and_si256(vec_a_2, vec_mask); + __m256i vec_v_bot_fir_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_2, vec_k3_2), vec_v_bot_2); + __m256i vec_v_bot_sec_2 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_2, vec_k4_2), vec_v_bot_2); + __m256i vec_v_top_lo_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_2, vec_v_top_sec_2), vec_sign_left_lo_2), vec_sign_left_lo_2); + __m256i vec_v_top_hi_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_2, vec_v_top_sec_2), vec_sign_left_hi_2), vec_sign_left_hi_2); + __m256i vec_v_bot_lo_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_2, vec_v_bot_sec_2), vec_sign_right_lo_2), vec_sign_right_lo_2); + __m256i vec_v_bot_hi_2 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_2, vec_v_bot_sec_2), vec_sign_right_hi_2), vec_sign_right_hi_2); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_2); 
+ vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_2); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_2); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_2); + __m256i vec_a_3 = vec_as[k * 4 + 3]; + __m128i vec_k1_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 0 + K3 / 3 * 32 * bs)); + __m128i vec_k2_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 16 + K3 / 3 * 32 * bs)); + __m128i vec_k3_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 32 + K3 / 3 * 32 * bs)); + __m128i vec_k4_3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + 3 * 64 + 48 + K3 / 3 * 32 * bs)); + __m256i vec_sign_left_hi_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3)), 15); + __m256i vec_sign_left_lo_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3 + 1)), 15); + __m256i vec_v_top_3 = _mm256_and_si256(_mm256_srli_epi16(vec_a_3, 4), vec_mask); + __m256i vec_v_top_fir_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1_3, vec_k1_3), vec_v_top_3); + __m256i vec_v_top_sec_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2_3, vec_k2_3), vec_v_top_3); + __m256i vec_sign_right_hi_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3 + 2)), 15); + __m256i vec_sign_right_lo_3 = _mm256_srai_epi16(_mm256_slli_epi16(vec_sign, (4 * 3 + 3)), 15); + __m256i vec_v_bot_3 = _mm256_and_si256(vec_a_3, vec_mask); + __m256i vec_v_bot_fir_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3_3, vec_k3_3), vec_v_bot_3); + __m256i vec_v_bot_sec_3 = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4_3, vec_k4_3), vec_v_bot_3); + __m256i vec_v_top_lo_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_top_fir_3, vec_v_top_sec_3), vec_sign_left_lo_3), vec_sign_left_lo_3); + __m256i vec_v_top_hi_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_top_fir_3, vec_v_top_sec_3), vec_sign_left_hi_3), vec_sign_left_hi_3); + __m256i vec_v_bot_lo_3 = 
_mm256_xor_si256(_mm256_add_epi16(_mm256_unpackhi_epi8(vec_v_bot_fir_3, vec_v_bot_sec_3), vec_sign_right_lo_3), vec_sign_right_lo_3); + __m256i vec_v_bot_hi_3 = _mm256_xor_si256(_mm256_add_epi16(_mm256_unpacklo_epi8(vec_v_bot_fir_3, vec_v_bot_sec_3), vec_sign_right_hi_3), vec_sign_right_hi_3); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi_3); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi_3); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo_3); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo_3); + } + __m256i vec_gc0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + BM8640_3200 * bs)); + __m256i vec_gc1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM8640_3200 * bs)); + __m256i vec_gc2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM8640_3200 * bs)); + __m256i vec_gc3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM8640_3200 * bs)); + vec_gc0 = _mm256_add_epi32(vec_gc0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c0))); + vec_gc1 = _mm256_add_epi32(vec_gc1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c0, 1))); + vec_gc2 = _mm256_add_epi32(vec_gc2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c1))); + vec_gc3 = _mm256_add_epi32(vec_gc3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c1, 1))); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + BM8640_3200 * bs), vec_gc0); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM8640_3200 * bs), vec_gc1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM8640_3200 * bs), vec_gc2); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM8640_3200 * bs), vec_gc3); + } + } +#endif +} + +template +inline int32_t two_tbl_impl8640_3200(int32_t* c, int8_t* lut, uint8_t* a) { +#ifdef __AVX2__ + const __m256i vec_mask = _mm256_set1_epi8(0x0f); + const int KK = BK2 / 2; +#pragma unroll + for (int i = 0; i < BM8640_3200; i += 32) { + __m256i vec_as[KK / 2]; + #pragma unroll + for (int ai = 0; ai < KK / 2; ai++) { + 
vec_as[ai] = _mm256_loadu_si256(reinterpret_cast<__m256i*>(a + i * KK / 2 + ai * 32)); + } +#pragma unroll + for (int bs = 0; bs < batch_size; bs++) { + __m256i vec_c0 = _mm256_setzero_si256(); + __m256i vec_c1 = _mm256_setzero_si256(); +#pragma unroll + for (int k = 0; k < KK / 8; k++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + __m256i vec_a = vec_as[k * 4 + j]; + + __m128i vec_k1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 0 + K2 / 2 * 32 * bs)); + __m128i vec_k2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 16 + K2 / 2 * 32 * bs)); + __m128i vec_k3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 32 + K2 / 2 * 32 * bs)); + __m128i vec_k4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(lut + k * 32 * 8 + j * 64 + 48 + K2 / 2 * 32 * bs)); + + __m256i vec_v_top = _mm256_and_si256(_mm256_srli_epi16(vec_a, 4), vec_mask); + __m256i vec_v_top_fir = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k1, vec_k1), vec_v_top); + __m256i vec_v_top_sec = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k2, vec_k2), vec_v_top); + + __m256i vec_v_bot = _mm256_and_si256(vec_a, vec_mask); + __m256i vec_v_bot_fir = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k3, vec_k3), vec_v_bot); + __m256i vec_v_bot_sec = _mm256_shuffle_epi8(_mm256_set_m128i(vec_k4, vec_k4), vec_v_bot); + + __m256i vec_v_top_lo = _mm256_unpackhi_epi8(vec_v_top_fir, vec_v_top_sec); + __m256i vec_v_top_hi = _mm256_unpacklo_epi8(vec_v_top_fir, vec_v_top_sec); + __m256i vec_v_bot_lo = _mm256_unpackhi_epi8(vec_v_bot_fir, vec_v_bot_sec); + __m256i vec_v_bot_hi = _mm256_unpacklo_epi8(vec_v_bot_fir, vec_v_bot_sec); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_top_hi); + vec_c0 = _mm256_add_epi16(vec_c0, vec_v_bot_hi); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_top_lo); + vec_c1 = _mm256_add_epi16(vec_c1, vec_v_bot_lo); + } + } + + __m256i vec_gc0 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + BM8640_3200 * bs)); + __m256i vec_gc1 = 
_mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM8640_3200 * bs)); + __m256i vec_gc2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM8640_3200 * bs)); + __m256i vec_gc3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM8640_3200 * bs)); + + vec_gc0 = _mm256_add_epi32(vec_gc0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c0))); + vec_gc1 = _mm256_add_epi32(vec_gc1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c0, 1))); + vec_gc2 = _mm256_add_epi32(vec_gc2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(vec_c1))); + vec_gc3 = _mm256_add_epi32(vec_gc3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(vec_c1, 1))); + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + BM8640_3200 * bs), vec_gc0); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 8 + BM8640_3200 * bs), vec_gc1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 16 + BM8640_3200 * bs), vec_gc2); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(c + i + 24 + BM8640_3200 * bs), vec_gc3); + } + } +#endif + return 0; +} + +template +int32_t three_qgemm_lut_8640_3200(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C) { + alignas(32) uint32_t CBits[BATCH_SIZE * BM8640_3200]; + memset(&(CBits[0]), 0, BATCH_SIZE * BM8640_3200 * sizeof(int32_t)); +#pragma unroll + for (int32_t k_outer = 0; k_outer < 3168 / BBK8640_3200; ++k_outer) { + three_tbl_impl_8640_3200((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BBK8640_3200 / 3 * 32)])), (&(((uint8_t*)A)[(k_outer * BBK8640_3200 / 3 / 2 * BM8640_3200)])), (&(((uint8_t*)sign)[(k_outer * BBK8640_3200 / 3 / 8 * BM8640_3200)]))); + } +#pragma unroll + for (int bs = 0; bs < BATCH_SIZE; bs++) { +#pragma unroll + for (int i = 0; i < BM8640_3200; i++) { + ((int32_t*)C)[i] = (int32_t)(((int32_t*)CBits)[i + bs * BM8640_3200]); + } + } + return 0; +} + +template +int32_t two_qgemm_lut_8640_3200(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) { + alignas(32) uint32_t 
CBits[BATCH_SIZE * BM8640_3200]; + memset(&(CBits[0]), 0, BATCH_SIZE * BM8640_3200 * sizeof(int32_t)); +#pragma unroll + for (int32_t k_outer = 0; k_outer < 32 / 32; ++k_outer) { + two_tbl_impl8640_3200((&(((int32_t*)CBits)[0])), (&(((int8_t*)LUT)[(k_outer * BK2 / 2 * 32)])), (&(((uint8_t*)A)[(k_outer * BK2 / 2 / 2 * BM8640_3200)]))); + } +#pragma unroll + for (int bs = 0; bs < BATCH_SIZE; bs++) { +#pragma unroll + for (int i = 0; i < BM8640_3200; i++) { + ((int32_t*)C)[i] += (int32_t)(((int32_t*)CBits)[i + bs * BM8640_3200]); + ((float*)C)[i] = (float)(((int32_t*)C)[i]) / ((float*)LUT_Scales)[bs] * ((float*)Scales)[0]; + } + } + return 0; +} + +void ggml_preprocessor(int bs, int m, int three_k, int two_k, void* B, void* LUT_Scales, void* Three_QLUT, void* Two_QLUT) { + partial_max_reset(bs, (&(((float*)LUT_Scales)[0]))); + if (m == 3200 && two_k == 0 && three_k == 8640) { + for (int32_t b = 0; b < bs; b++) { + per_tensor_quant(two_k + three_k, (&(((float*)LUT_Scales)[b])), (&(((float*)B)[b * (two_k + three_k)]))); + three_lut_ctor<8640>((&(((int8_t*)Three_QLUT)[b * three_k / 3 * 32])), (&(((float*)B)[b * (three_k + two_k)])), (&(((float*)LUT_Scales)[b]))); + two_lut_ctor<0>((&(((int8_t*)Two_QLUT)[b * two_k / 2 * 32])), (&(((float*)B)[b * (three_k + two_k) + 8640])), (&(((float*)LUT_Scales)[b]))); + } + } + else if (m == 3200 && two_k == 32 && three_k == 3168) { + for (int32_t b = 0; b < bs; b++) { + per_tensor_quant(two_k + three_k, (&(((float*)LUT_Scales)[b])), (&(((float*)B)[b * (two_k + three_k)]))); + three_lut_ctor<3168>((&(((int8_t*)Three_QLUT)[b * three_k / 3 * 32])), (&(((float*)B)[b * (three_k + two_k)])), (&(((float*)LUT_Scales)[b]))); + two_lut_ctor<32>((&(((int8_t*)Two_QLUT)[b * two_k / 2 * 32])), (&(((float*)B)[b * (three_k + two_k) + 3168])), (&(((float*)LUT_Scales)[b]))); + } + } + else if (m == 8640 && two_k == 32 && three_k == 3168) { + for (int32_t b = 0; b < bs; b++) { + per_tensor_quant(two_k + three_k, (&(((float*)LUT_Scales)[b])), 
(&(((float*)B)[b * (two_k + three_k)]))); + three_lut_ctor<3168>((&(((int8_t*)Three_QLUT)[b * three_k / 3 * 32])), (&(((float*)B)[b * (three_k + two_k)])), (&(((float*)LUT_Scales)[b]))); + two_lut_ctor<32>((&(((int8_t*)Two_QLUT)[b * two_k / 2 * 32])), (&(((float*)B)[b * (three_k + two_k) + 3168])), (&(((float*)LUT_Scales)[b]))); + } + } +} +void ggml_qgemm_lut(int bs, int m, int k, int BK, void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C) { + if (m == 3200 && k == 8640) { + if (BK == 0) { + if (bs == 1) { + two_qgemm_lut_3200_8640<1>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 8) { + two_qgemm_lut_3200_8640<8>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 32) { + two_qgemm_lut_3200_8640<32>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 128) { + two_qgemm_lut_3200_8640<128>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 256) { + two_qgemm_lut_3200_8640<256>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 512) { + two_qgemm_lut_3200_8640<512>(A, LUT, Scales, LUT_Scales, C); + } + } + else if (BK == 8640) { + if (bs == 1) { + three_qgemm_lut_3200_8640<1>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 8) { + three_qgemm_lut_3200_8640<8>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 32) { + three_qgemm_lut_3200_8640<32>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 128) { + three_qgemm_lut_3200_8640<128>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 256) { + three_qgemm_lut_3200_8640<256>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 512) { + three_qgemm_lut_3200_8640<512>(A, sign, LUT, Scales, LUT_Scales, C); + } + } + } + else if (m == 3200 && k == 3200) { + if (BK == 32) { + if (bs == 1) { + two_qgemm_lut_3200_3200<1>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 8) { + two_qgemm_lut_3200_3200<8>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 32) { + two_qgemm_lut_3200_3200<32>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 128) { + 
two_qgemm_lut_3200_3200<128>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 256) { + two_qgemm_lut_3200_3200<256>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 512) { + two_qgemm_lut_3200_3200<512>(A, LUT, Scales, LUT_Scales, C); + } + } + else if (BK == 3168) { + if (bs == 1) { + three_qgemm_lut_3200_3200<1>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 8) { + three_qgemm_lut_3200_3200<8>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 32) { + three_qgemm_lut_3200_3200<32>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 128) { + three_qgemm_lut_3200_3200<128>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 256) { + three_qgemm_lut_3200_3200<256>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 512) { + three_qgemm_lut_3200_3200<512>(A, sign, LUT, Scales, LUT_Scales, C); + } + } + } + else if (m == 8640 && k == 3200) { + if (BK == 32) { + if (bs == 1) { + two_qgemm_lut_8640_3200<1>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 8) { + two_qgemm_lut_8640_3200<8>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 32) { + two_qgemm_lut_8640_3200<32>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 128) { + two_qgemm_lut_8640_3200<128>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 256) { + two_qgemm_lut_8640_3200<256>(A, LUT, Scales, LUT_Scales, C); + } else if (bs == 512) { + two_qgemm_lut_8640_3200<512>(A, LUT, Scales, LUT_Scales, C); + } + } + else if (BK == 3168) { + if (bs == 1) { + three_qgemm_lut_8640_3200<1>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 8) { + three_qgemm_lut_8640_3200<8>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 32) { + three_qgemm_lut_8640_3200<32>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 128) { + three_qgemm_lut_8640_3200<128>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 256) { + three_qgemm_lut_8640_3200<256>(A, sign, LUT, Scales, LUT_Scales, C); + }else if (bs == 512) { + three_qgemm_lut_8640_3200<512>(A, sign, LUT, Scales, 
LUT_Scales, C); + } + } + } +} + +void ggml_bitnet_transform_tensor(struct ggml_tensor * tensor) { + if (!(is_type_supported(tensor->type) && tensor->backend == GGML_BACKEND_TYPE_CPU && tensor->extra == nullptr)) { + return; + } + + int k = tensor->ne[0]; + int m = tensor->ne[1]; + const int lut_scales_size = 1; + int bk = 0; + int bm = 0; + + if (m == 3200 && k == 8640) { + bm = BM3200_8640; + bk = BBK3200_8640; + } +else if (m == 3200 && k == 3200) { + bm = BM3200_3200; + bk = BBK3200_3200; + } +else if (m == 8640 && k == 3200) { + bm = BM8640_3200; + bk = BBK8640_3200; + } + + const int n_tile_num = m / bm; + const int BK = bk; + uint8_t * qweights; + bitnet_float_type * scales; + + scales = (bitnet_float_type *) aligned_malloc(sizeof(bitnet_float_type)); + qweights = (uint8_t *) tensor->data; + float * i2_scales = (float * )(qweights + k * m / 4); + scales[0] = (bitnet_float_type) i2_scales[0]; + + tensor->extra = bitnet_tensor_extras + bitnet_tensor_extras_index; + bitnet_tensor_extras[bitnet_tensor_extras_index++] = { + /* .lut_scales_size = */ lut_scales_size, + /* .BK = */ BK, + /* .n_tile_num = */ n_tile_num, + /* .qweights = */ qweights, + /* .scales = */ scales + }; +} +#endif \ No newline at end of file diff --git a/scripts/benchmark_bitnet_embeddings_0.6b.sh b/scripts/benchmark_bitnet_embeddings_0.6b.sh new file mode 100755 index 000000000..d6e8542f9 --- /dev/null +++ b/scripts/benchmark_bitnet_embeddings_0.6b.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# bitnet-embeddings-0.6b (Qwen3) CPU Benchmark: F16 vs I2_S vs I2_S+Q6K_embd +# +# Usage: cd /home/huangxin/code_list/BitNet && bash scripts/benchmark_bitnet_embeddings_0.6b.sh [threads] +# Examples: +# bash scripts/benchmark_bitnet_embeddings_0.6b.sh 4 +# bash scripts/benchmark_bitnet_embeddings_0.6b.sh 8 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +BENCH="${PROJECT_DIR}/build/bin/llama-bench" +THREADS=${1:-4} + +# --- Model paths --- 
+BASE_DIR="/data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b" +# GGUF_F16="${BASE_DIR}/bitnet-embeddings-0.6b-f16.gguf" +# GGUF_I2S="${BASE_DIR}/bitnet-embeddings-0.6b-f16-i2_s.gguf" +GGUF_F16="${BASE_DIR}/bitnet-embeddings-0.6b-f16-new.gguf" +GGUF_I2S="${BASE_DIR}/bitnet-embeddings-0.6b-f16-new-i2_s.gguf" +# GGUF_I2S_Q6K="${BASE_DIR}/bitnet-embeddings-0.6b-f16-i2_s-q6k_embed.gguf" + +# Common bench args: CPU only, 3 repetitions +# pp (prefill) + tg (decode/generate) +BENCH_ARGS="-t $THREADS -p 128,256,512,1024,2048 -n 128,256 -r 3 -ngl 0" + +echo "========================================================" +echo " bitnet-embeddings-0.6b (Qwen3) CPU Benchmark" +echo " Threads: $THREADS" +echo " Prompt lengths (pp): 128, 256, 512, 1024, 2048" +echo " Generate lengths (tg): 128, 256" +echo " Repetitions: 3" +echo "========================================================" +echo + +# --- Model sizes --- +echo "=== Model Sizes ===" +echo "--- bitnet-embeddings-0.6b (qwen3) ---" +[ -f "$GGUF_F16" ] && echo " F16: $(du -h "$GGUF_F16" | cut -f1)" +[ -f "$GGUF_I2S" ] && echo " I2_S: $(du -h "$GGUF_I2S" | cut -f1)" +[ -f "$GGUF_I2S_Q6K" ] && echo " I2_S+Q6K: $(du -h "$GGUF_I2S_Q6K" | cut -f1)" +echo + +# ============================================================ +# bitnet-embeddings-0.6b (qwen3) +# ============================================================ +echo "========================================================" +echo " bitnet-embeddings-0.6b (qwen3, hidden=1024, intermediate=3072)" +echo "========================================================" + +echo +echo "--- [bitnet-embeddings-0.6b] F16 ---" +[ -f "$GGUF_F16" ] && $BENCH -m "$GGUF_F16" $BENCH_ARGS || echo "⚠ Model not found: $GGUF_F16" + +echo +echo "--- [bitnet-embeddings-0.6b] I2_S ---" +[ -f "$GGUF_I2S" ] && $BENCH -m "$GGUF_I2S" $BENCH_ARGS || echo "⚠ Model not found: $GGUF_I2S" + +# echo +# echo "--- [bitnet-embeddings-0.6b] I2_S + Q6K embedding ---" +# [ -f 
"$GGUF_I2S_Q6K" ] && $BENCH -m "$GGUF_I2S_Q6K" $BENCH_ARGS || echo "⚠ Model not found: $GGUF_I2S_Q6K" + +echo +echo "========================================================" +echo " Done. All benchmarks completed." +echo "========================================================" diff --git a/scripts/eval_mmteb_v2.py b/scripts/eval_mmteb_v2.py new file mode 100644 index 000000000..8e6c9c785 --- /dev/null +++ b/scripts/eval_mmteb_v2.py @@ -0,0 +1,508 @@ +#!/usr/bin/env python3 +""" +Evaluate bitnet-embeddings-0.6b on MTEB multilingual v2 benchmark. + +Supports 3 model formats: + - safetensors: PyTorch GPU inference (baseline) + - f16: F16 GGUF via llama-embedding (CPU) + - i2s: I2_S GGUF via llama-embedding (CPU) + +Usage: + # Run all 3 models + python scripts/eval_mmteb_v2.py \ + --model-dir /data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b \ + --build-dir /home/huangxin/code_list/BitNet/build \ + --output-dir /home/huangxin/code_list/BitNet/eval_results + + # Run specific model type only + python scripts/eval_mmteb_v2.py --model-dir ... --model-type i2s + + # Dry run (1 task) + python scripts/eval_mmteb_v2.py --model-dir ... 
--dry-run +""" + +import os +import sys +import json +import argparse +import logging +import tempfile +import subprocess + +# Parse --gpu early so CUDA_VISIBLE_DEVICES is set before torch import +_gpu = "0,1" +for i, arg in enumerate(sys.argv): + if arg == "--gpu" and i + 1 < len(sys.argv): + _gpu = sys.argv[i + 1] +os.environ["CUDA_VISIBLE_DEVICES"] = _gpu + +import numpy as np +import torch +import torch.nn.functional as F + +from typing import List, Optional +from pathlib import Path +from torch.utils.data import DataLoader + +# Add bitnet-embeddings-v260420/src to path for SimpleEncoder and eval_instructions +BITNET_EMBED_SRC = "/home/huangxin/code_list/bitnet-embeddings-v260420/src" +sys.path.insert(0, BITNET_EMBED_SRC) + +import mteb +from mteb import TaskMetadata, EncoderProtocol, BenchmarkResults +from mteb.benchmarks.benchmarks import MTEB_multilingual_v2 +from mteb.types import PromptType, BatchedInput, Array +from mteb.models.model_meta import ScoringFunction, ModelMeta +from mteb.results import ModelResult + +from inference.eval_instructions import get_instruct_for_eval + +logging.basicConfig( + format="[%(asctime)s %(levelname)s] %(message)s", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + +# ── Selected tasks (10 small/fast tasks across different types, ~1h for 3 models) ── + +SELECTED_TASKS = [ + # BitextMining (1) + "BornholmBitextMining", + # Classification (3) + "PoemSentimentClassification", + "KorSarcasmClassification", + "FinancialPhrasebankClassification", + # STS (3) + "SICK-R", + "STSBenchmark", + "STS17", + # PairClassification (1) + #"SprintDuplicateQuestions", + # MultilabelClassification (1) + "KorHateSpeechMLClassification", + # Reranking (1) + #"T2Reranking", +] + + + +# ── LlamaCpp Encoder (for GGUF models) ─────────────────────────────────────── + +class LlamaCppEncoder: + """Encode texts using llama-embedding binary.""" + + def __init__(self, gguf_path: str, build_dir: str, threads: int = 8, + ctx_size: int = 
2048, batch_size: int = 64): + self.gguf_path = gguf_path + self.embd_bin = str(Path(build_dir) / "bin" / "llama-embedding") + self.threads = threads + self.ctx_size = ctx_size + self.batch_size = batch_size # texts per subprocess call + + if not Path(self.embd_bin).exists(): + raise FileNotFoundError(f"llama-embedding not found at {self.embd_bin}") + + def encode(self, sentences: List[str]) -> np.ndarray: + """Encode sentences in batches via llama-embedding.""" + all_embeddings = [] + + for i in range(0, len(sentences), self.batch_size): + batch = sentences[i:i + self.batch_size] + embeddings = self._encode_batch(batch) + if embeddings is None: + raise RuntimeError(f"llama-embedding failed on batch {i//self.batch_size}") + all_embeddings.append(embeddings) + + return np.concatenate(all_embeddings, axis=0) + + def _encode_batch(self, texts: List[str]) -> Optional[np.ndarray]: + """Run llama-embedding on a batch of texts using file input.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: + # Write texts separated by a unique separator + sep = "<#SEP#>" + # Replace any occurrence of separator in texts to avoid collision + clean_texts = [t.replace(sep, " ") for t in texts] + f.write(sep.join(clean_texts)) + tmp_path = f.name + + try: + result = subprocess.run( + [self.embd_bin, + "-m", self.gguf_path, + "-f", tmp_path, + "--embd-normalize", "2", + "--embd-output-format", "json", + "--embd-separator", sep, + "--pooling", "last", + "-c", str(self.ctx_size), + "-t", str(self.threads)], + capture_output=True, text=True, timeout=600 + ) + + if result.returncode != 0: + logger.error(f"llama-embedding failed (rc={result.returncode}): {result.stderr[:500]}") + return None + + stdout = result.stdout.strip() + if not stdout: + logger.error(f"llama-embedding returned empty stdout. 
stderr: {result.stderr[:500]}") + return None + + data = json.loads(stdout) + if isinstance(data, dict) and "data" in data: + embeddings = [item["embedding"] for item in data["data"]] + elif isinstance(data, list): + embeddings = data + else: + logger.error(f"Unexpected output format: {type(data)}") + return None + + return np.array(embeddings, dtype=np.float32) + finally: + os.unlink(tmp_path) + + +# ── Custom Model wrappers for MTEB ─────────────────────────────────────────── + +class SafetensorsModel: + """MTEB model wrapper using PyTorch SimpleEncoder. + + Args: + disable_quant: If True, disable BitLinear activation/weight quantization + (for comparing with F16 GGUF which uses full-precision weights). + If False (default), use full BitLinear quantization + (for comparing with I2_S GGUF which uses ternary weights). + """ + + def __init__(self, model_name: str, pool_type: str = "last", + instruct_type: str = "detailed", disable_quant: bool = False, + **kwargs): + from search import simple_encoder as se_module + from search.simple_encoder import SimpleEncoder + self.model_name_or_path = model_name + self.instruct_type = instruct_type + + # Patch: if flash_attn not available, force sdpa + try: + import flash_attn # noqa: F401 + except ImportError: + _orig_from_pretrained = se_module.AutoModel.from_pretrained + def _patched_from_pretrained(*args, **kwargs): + if kwargs.get('attn_implementation') == 'flash_attention_2': + kwargs['attn_implementation'] = 'sdpa' + return _orig_from_pretrained(*args, **kwargs) + se_module.AutoModel.from_pretrained = _patched_from_pretrained + + self.encoder = SimpleEncoder( + model_name_or_path=model_name, + pool_type=pool_type, + l2_normalize=True, + use_bitnet=True, + ) + + if disable_quant: + # Disable activation_quant and weight_quant in BitLinear forward + # so PyTorch uses full-precision weights (matching F16 GGUF behavior) + from bitnet import BitLinear + def no_quant_forward(self, x): + w = self.weight + x_norm = self.norm(x) if 
self.should_rms else x + return torch.nn.functional.linear(x_norm, w, self.bias) + BitLinear.forward = no_quant_forward + logger.info("BitLinear quantization DISABLED (no-quant mode for F16 comparison)") + + def encode(self, inputs: DataLoader[BatchedInput], task_metadata: TaskMetadata, + hf_split: str, hf_subset: str, prompt_type: PromptType | None = None, + **kwargs) -> np.ndarray: + instruct = get_instruct_for_eval( + task_name=task_metadata.name, task_type=task_metadata.type, + instruct_type=self.instruct_type, prompt_type=prompt_type, + ) + + input_texts = [] + for batch in inputs: + input_texts.extend(batch['text']) + input_texts = [instruct + text for text in input_texts] + + embeds = self.encoder.encode(input_texts) + if task_metadata.type in ['BitextMining', 'Retrieval', 'Reranking']: + embeds = torch.tensor(embeds, device='cuda', dtype=torch.float16) + return embeds + + def similarity(self, e1: Array, e2: Array) -> Array: + if isinstance(e1, np.ndarray): + e1 = torch.tensor(e1, device='cuda', dtype=torch.float16) + e2 = torch.tensor(e2, device='cuda', dtype=torch.float16) + return torch.matmul(e1, e2.T) + + def similarity_pairwise(self, e1: Array, e2: Array) -> Array: + if isinstance(e1, np.ndarray): + e1 = torch.tensor(e1, device='cuda', dtype=torch.float16) + e2 = torch.tensor(e2, device='cuda', dtype=torch.float16) + return torch.sum(e1 * e2, dim=1) + + +class GGUFModel: + """MTEB model wrapper using llama-embedding binary.""" + + def __init__(self, gguf_path: str, build_dir: str, instruct_type: str = "detailed", + threads: int = 8, **kwargs): + self.model_name_or_path = gguf_path + self.instruct_type = instruct_type + self.cpp_encoder = LlamaCppEncoder( + gguf_path=gguf_path, build_dir=build_dir, threads=threads, + ) + + def encode(self, inputs: DataLoader[BatchedInput], task_metadata: TaskMetadata, + hf_split: str, hf_subset: str, prompt_type: PromptType | None = None, + **kwargs) -> np.ndarray: + instruct = get_instruct_for_eval( + 
task_name=task_metadata.name, task_type=task_metadata.type, + instruct_type=self.instruct_type, prompt_type=prompt_type, + ) + + input_texts = [] + for batch in inputs: + input_texts.extend(batch['text']) + input_texts = [instruct + text for text in input_texts] + + embeds = self.cpp_encoder.encode(input_texts) + if task_metadata.type in ['BitextMining', 'Retrieval', 'Reranking']: + embeds = torch.tensor(embeds, device='cuda', dtype=torch.float16) + return embeds + + def similarity(self, e1: Array, e2: Array) -> Array: + if isinstance(e1, np.ndarray): + e1 = torch.tensor(e1, device='cuda', dtype=torch.float16) + e2 = torch.tensor(e2, device='cuda', dtype=torch.float16) + return torch.matmul(e1, e2.T) + + def similarity_pairwise(self, e1: Array, e2: Array) -> Array: + if isinstance(e1, np.ndarray): + e1 = torch.tensor(e1, device='cuda', dtype=torch.float16) + e2 = torch.tensor(e2, device='cuda', dtype=torch.float16) + return torch.sum(e1 * e2, dim=1) + + +# ── Evaluation runner ──────────────────────────────────────────────────────── + +def build_mteb_model(model_obj, model_name: str) -> EncoderProtocol: + """Wrap a model object into MTEB's ModelMeta/EncoderProtocol.""" + model_meta = ModelMeta( + loader=lambda model_name, **kw: model_obj, + loader_kwargs={}, + name=model_name, + revision='latest', + release_date=None, + languages=None, + n_parameters=None, + memory_usage_mb=None, + max_tokens=2048, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=[], + similarity_fn_name=ScoringFunction.COSINE, + use_instructions=True, + training_datasets=set(), + ) + return model_meta.load_model() + + +def run_evaluation(model_obj, model_name: str, tasks, output_dir: str, + overwrite: bool = False) -> List[ModelResult]: + """Run MTEB evaluation on selected tasks.""" + model = build_mteb_model(model_obj, model_name) + logger.info(f"Loaded model: {model_name}") + + results = [] + for idx, task in 
enumerate(tasks): + logger.info(f"[{idx+1}/{len(tasks)}] Evaluating: {task.metadata.name}") + try: + os.environ['HF_DATASETS_OFFLINE'] = '1' + result = mteb.evaluate( + model, tasks=task, + encode_kwargs={"batch_size": 64 * 1024}, + overwrite_strategy='always' if overwrite else 'only-missing', + prediction_folder=output_dir, + ) + except Exception: + os.environ['HF_DATASETS_OFFLINE'] = '0' + result = mteb.evaluate( + model, tasks=task, + encode_kwargs={"batch_size": 64 * 1024}, + overwrite_strategy='always' if overwrite else 'only-missing', + prediction_folder=output_dir, + ) + results.append(result) + torch.cuda.empty_cache() + + return results + + +def print_comparison_table(all_results: dict): + """Print a comparison table: task × model_type → main_score.""" + # Collect all task names and scores from ModelResult objects + all_tasks = set() + score_map = {} # (task, model_type) -> score + for model_type, results in all_results.items(): + for model_result in results: + try: + for task_result in model_result.task_results: + task_name = task_result.task_name + all_tasks.add(task_name) + scores = task_result.get_score() + if scores is not None: + score_map[(task_name, model_type)] = scores + except Exception as e: + logger.warning(f"Error extracting scores for {model_type}: {e}") + + if not all_tasks: + logger.warning("No results to compare") + return + + model_types = list(all_results.keys()) + all_tasks = sorted(all_tasks) + + header = f"{'Task':<45}" + "".join(f"{mt:>20}" for mt in model_types) + print("\n" + "=" * len(header)) + print(" MTEB v2 Evaluation Results Comparison") + print("=" * len(header)) + print(header) + print("-" * len(header)) + + avg_scores = {mt: [] for mt in model_types} + for task in all_tasks: + row = f"{task:<45}" + for mt in model_types: + score = score_map.get((task, mt)) + if score is not None: + row += f"{score:>20.4f}" + avg_scores[mt].append(score) + else: + row += f"{'N/A':>20}" + print(row) + + print("-" * len(header)) + avg_row = 
f"{'AVERAGE':<45}" + for mt in model_types: + if avg_scores[mt]: + avg_row += f"{np.mean(avg_scores[mt]):>20.4f}" + else: + avg_row += f"{'N/A':>20}" + print(avg_row) + print("=" * len(header)) + + +def main(): + parser = argparse.ArgumentParser(description="MTEB v2 evaluation for bitnet-embeddings-0.6b") + parser.add_argument("--model-dir", type=str, required=True, + help="Path to model directory containing safetensors and GGUF files") + parser.add_argument("--build-dir", type=str, + default="/home/huangxin/code_list/BitNet/build", + help="Path to BitNet build directory") + parser.add_argument("--output-dir", type=str, + default="/home/huangxin/code_list/BitNet/eval_results", + help="Output directory for results") + parser.add_argument("--model-type", type=str, default="all", + choices=["all", "safetensors", "safetensors_noquant", "f16", "i2s"], + help="Which model type(s) to evaluate") + parser.add_argument("--threads", type=int, default=8, + help="Number of threads for llama-embedding") + parser.add_argument("--tasks", type=str, default="", + help="Comma-separated task names (default: selected ~33 tasks)") + parser.add_argument("--dry-run", action="store_true", + help="Run only 1 task for quick test") + parser.add_argument("--overwrite", action="store_true", + help="Overwrite existing results") + parser.add_argument("--gpu", type=str, default="0,1", + help="CUDA visible devices (default: 0,1)") + args = parser.parse_args() + + model_dir = Path(args.model_dir) + os.makedirs(args.output_dir, exist_ok=True) + + # Resolve model files + safetensors_path = model_dir / "model.safetensors" + f16_gguf = model_dir / "bitnet-embeddings-0.6b-f16-new.gguf" + i2s_gguf = model_dir / "bitnet-embeddings-0.6b-f16-new-i2_s.gguf" + + # Select tasks + all_mteb_tasks = list(MTEB_multilingual_v2.tasks) + if args.tasks: + allowed = args.tasks.split(",") + tasks = [t for t in all_mteb_tasks if t.metadata.name in allowed] + else: + tasks = [t for t in all_mteb_tasks if t.metadata.name in 
SELECTED_TASKS] + + if args.dry_run: + tasks = tasks[:1] + + logger.info(f"Will evaluate {len(tasks)} tasks: {[t.metadata.name for t in tasks]}") + + # Run evaluations + all_results = {} + if args.model_type == "all": + model_types = ["safetensors", "f16", "i2s"] + elif args.model_type == "safetensors": + model_types = ["safetensors"] + else: + model_types = [args.model_type] + + for model_type in model_types: + logger.info(f"\n{'='*60}") + logger.info(f" Evaluating model type: {model_type}") + logger.info(f"{'='*60}") + + out_dir = os.path.join(args.output_dir, model_type) + os.makedirs(out_dir, exist_ok=True) + + if model_type == "safetensors": + if not safetensors_path.exists(): + logger.warning(f"Skipping safetensors: {safetensors_path} not found") + continue + model_obj = SafetensorsModel( + model_name=str(model_dir), pool_type="last", instruct_type="detailed", + disable_quant=False, + ) + name = "microsoft/bitnet-embeddings-0.6b-safetensors" + + elif model_type == "f16": + if not f16_gguf.exists(): + logger.warning(f"Skipping f16: {f16_gguf} not found") + continue + model_obj = GGUFModel( + gguf_path=str(f16_gguf), build_dir=args.build_dir, + instruct_type="detailed", threads=args.threads, + ) + name = "microsoft/bitnet-embeddings-0.6b-f16-gguf" + + elif model_type == "i2s": + if not i2s_gguf.exists(): + logger.warning(f"Skipping i2s: {i2s_gguf} not found") + continue + model_obj = GGUFModel( + gguf_path=str(i2s_gguf), build_dir=args.build_dir, + instruct_type="detailed", threads=args.threads, + ) + name = "microsoft/bitnet-embeddings-0.6b-i2s-gguf" + + results = run_evaluation(model_obj, name, tasks, out_dir, args.overwrite) + all_results[model_type] = results + + # Clean up GPU memory after safetensors models + if model_type.startswith("safetensors"): + del model_obj + torch.cuda.empty_cache() + + # Print comparison + if len(all_results) > 1: + print_comparison_table(all_results) + + logger.info("Done!") + + +if __name__ == "__main__": + main() diff --git 
a/scripts/verify_gguf_precision.py b/scripts/verify_gguf_precision.py new file mode 100644 index 000000000..7c24dd7b9 --- /dev/null +++ b/scripts/verify_gguf_precision.py @@ -0,0 +1,622 @@ +#!/usr/bin/env python3 +""" +Verify precision alignment between: + 1. model.safetensors (HF original) + 2. converted GGUF file + +Two levels of verification: + Level 1: Tensor-level comparison (weight values) + Level 2: Inference-level comparison (embedding output from PyTorch vs llama.cpp) + +Usage: + python verify_gguf_precision.py \ + --model-dir /path/to/bitnet-embeddings-0.6b \ + --gguf-file /path/to/output.gguf \ + [--level 1|2|both] \ + [--text "hello world"] +""" + +from __future__ import annotations +import argparse +import json +import sys +import os +import logging +import warnings +from pathlib import Path +from contextlib import contextmanager + +import numpy as np +import torch + +# Suppress noisy warnings from transformers / tokenizers BEFORE any transformers import +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" +os.environ["TRANSFORMERS_VERBOSITY"] = "error" +warnings.filterwarnings("ignore") +logging.getLogger("transformers").setLevel(logging.CRITICAL) +logging.getLogger("transformers.modeling_utils").setLevel(logging.CRITICAL) + + +@contextmanager +def suppress_stderr(): + """Redirect stderr to /dev/null during model loading.""" + old_stderr = sys.stderr + sys.stderr = open(os.devnull, 'w') + try: + yield + finally: + sys.stderr.close() + sys.stderr = old_stderr + + +# ────────────────────────────────────────────────────────────────────── +# Level 1: Tensor-level comparison +# ────────────────────────────────────────────────────────────────────── + +def build_name_map(n_layers: int) -> dict[str, str]: + """GGUF tensor name -> HF tensor name.""" + m = { + "token_embd.weight": "embed_tokens.weight", + "output_norm.weight": "norm.weight", + "output.weight": "embed_tokens.weight", + } + for i in 
range(n_layers): + p, b = f"layers.{i}", f"blk.{i}" + m.update({ + f"{b}.attn_norm.weight": f"{p}.input_layernorm.weight", + f"{b}.ffn_norm.weight": f"{p}.post_attention_layernorm.weight", + f"{b}.attn_q.weight": f"{p}.self_attn.q_proj.weight", + f"{b}.attn_k.weight": f"{p}.self_attn.k_proj.weight", + f"{b}.attn_v.weight": f"{p}.self_attn.v_proj.weight", + f"{b}.attn_output.weight": f"{p}.self_attn.o_proj.weight", + f"{b}.attn_q_norm.weight": f"{p}.self_attn.q_norm.weight", + f"{b}.attn_k_norm.weight": f"{p}.self_attn.k_norm.weight", + f"{b}.attn_q_norm_in.weight": f"{p}.self_attn.q_proj.norm.weight", + f"{b}.attn_k_norm_in.weight": f"{p}.self_attn.k_proj.norm.weight", + f"{b}.attn_v_norm_in.weight": f"{p}.self_attn.v_proj.norm.weight", + f"{b}.attn_output_norm_in.weight": f"{p}.self_attn.o_proj.norm.weight", + f"{b}.ffn_gate.weight": f"{p}.mlp.gate_proj.weight", + f"{b}.ffn_up.weight": f"{p}.mlp.up_proj.weight", + f"{b}.ffn_down.weight": f"{p}.mlp.down_proj.weight", + f"{b}.ffn_gate_norm_in.weight": f"{p}.mlp.gate_proj.norm.weight", + f"{b}.ffn_up_norm_in.weight": f"{p}.mlp.up_proj.norm.weight", + f"{b}.ffn_down_norm_in.weight": f"{p}.mlp.down_proj.norm.weight", + }) + return m + + +def verify_tensors(model_dir: Path, gguf_file: Path): + """Compare every tensor value between safetensors and GGUF.""" + from safetensors import safe_open + from gguf import GGUFReader + + print("=" * 70) + print("Level 1: Tensor-level precision verification") + print("=" * 70) + + with open(model_dir / "config.json") as f: + config = json.load(f) + n_layers = config["num_hidden_layers"] + + # GGUF name -> HF name + name_map = build_name_map(n_layers) + + # Load GGUF tensors + reader = GGUFReader(str(gguf_file)) + gguf_tensors = {t.name: t for t in reader.tensors} + + # Load safetensors + sf = safe_open(str(model_dir / "model.safetensors"), framework="pt", device="cpu") + hf_keys = set(sf.keys()) + + total = 0 + passed = 0 + failed = 0 + max_global_err = 0.0 + failures = [] + + for 
gguf_name, gguf_tensor in sorted(gguf_tensors.items()): + hf_name = name_map.get(gguf_name) + if hf_name is None: + print(f" [SKIP] {gguf_name} — no HF mapping") + continue + if hf_name not in hf_keys: + print(f" [SKIP] {gguf_name} — HF tensor '{hf_name}' not found") + continue + + total += 1 + hf_data = sf.get_tensor(hf_name).float().numpy() + gguf_data = gguf_tensor.data.copy() + + # Skip quantized tensors (I2_S/TL1/TL2) — they are packed and cannot be compared element-wise + gguf_type_name = gguf_tensor.tensor_type.name if hasattr(gguf_tensor, 'tensor_type') else "" + if gguf_type_name in ("I2_S", "TL1", "TL2"): + print(f" [SKIP] {gguf_name} — quantized ({gguf_type_name}), not comparable at tensor level") + total -= 1 + continue + + # For f16 GGUF tensors: compare against bf16->f16 of original + if gguf_data.dtype == np.float16: + hf_cmp = hf_data.astype(np.float16).astype(np.float32) + gguf_cmp = gguf_data.astype(np.float32) + else: + # f32: compare against bf16->f32 of original + hf_cmp = hf_data + gguf_cmp = gguf_data.astype(np.float32) + + # Shape check + if hf_cmp.shape != gguf_cmp.shape: + print(f" [FAIL] {gguf_name}: shape mismatch HF={hf_cmp.shape} GGUF={gguf_cmp.shape}") + failed += 1 + failures.append(gguf_name) + continue + + max_err = np.max(np.abs(hf_cmp - gguf_cmp)) + mean_err = np.mean(np.abs(hf_cmp - gguf_cmp)) + max_global_err = max(max_global_err, max_err) + + status = "OK" if max_err < 1e-3 else "FAIL" + if status == "FAIL": + failed += 1 + failures.append(gguf_name) + else: + passed += 1 + + # Only print failures and a summary for passing ones + if status == "FAIL" or max_err > 0: + print(f" [{status}] {gguf_name}: max_err={max_err:.6e}, mean_err={mean_err:.6e}, " + f"shape={gguf_data.shape}, dtype={gguf_data.dtype}") + + print(f"\n Total: {total}, Passed: {passed}, Failed: {failed}") + print(f" Global max error: {max_global_err:.6e}") + if max_global_err == 0: + print(" ✅ PERFECT: All tensor values match exactly!") + elif max_global_err < 
1e-3: + print(" ✅ GOOD: All tensor values within f16 tolerance") + else: + print(" ❌ FAILED: Some tensors have unacceptable error") + for f in failures: + print(f" - {f}") + + return failed == 0 + + +# ────────────────────────────────────────────────────────────────────── +# Level 2: Inference-level comparison (PyTorch vs llama.cpp embeddings) +# ────────────────────────────────────────────────────────────────────── + +def run_pytorch_embedding(model_dir: Path, texts: list[str], disable_quant: bool = False) -> np.ndarray: + """Run PyTorch inference with BitNet model to get embeddings. + + Args: + model_dir: HuggingFace model directory with safetensors + bitnet_config.json + texts: list of texts to embed + disable_quant: if True, disable BitLinear activation/weight quantization + (for comparing with f16 GGUF which stores full-precision weights). + if False (default), use full BitLinear quantization + (for comparing with i2_s GGUF which stores ternary weights). + """ + quant_label = "without" if disable_quant else "with" + from transformers import AutoTokenizer, AutoModel, AutoConfig + import torch.nn.functional as F + + device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") + print(f"\n Running PyTorch inference ({quant_label} BitLinear quantization) on [{device}]...") + + # Add bitnet-embeddings source to path + bitnet_src = Path("/home/huangxin/code_list/bitnet-embeddings-v260420/src") + if bitnet_src.exists() and str(bitnet_src) not in sys.path: + sys.path.insert(0, str(bitnet_src)) + + with suppress_stderr(): + tokenizer = AutoTokenizer.from_pretrained(str(model_dir)) + model = AutoModel.from_pretrained(str(model_dir), torch_dtype=torch.float32) + + # Apply BitLinear replacement and reload weights (same as simple_encoder.py) + bitnet_config_file = model_dir / "bitnet_config.json" + if bitnet_config_file.exists(): + from bitnet import replace_linear_with_bitlinear, BitLinear + with open(bitnet_config_file) as f: + bitnet_config = json.load(f) + + 
if disable_quant: + # Disable activation_quant and weight_quant for f16 GGUF comparison + original_forward = BitLinear.forward + def no_quant_forward(self, x): + w = self.weight + x_norm = self.norm(x) if self.should_rms else x + return torch.nn.functional.linear(x_norm, w, self.bias) + BitLinear.forward = no_quant_forward + + with suppress_stderr(): + model = replace_linear_with_bitlinear( + model, + weight_quant_method=bitnet_config.get('weight_quant_method', 'minmax'), + standard_bitnet=bitnet_config.get('standard_bitnet', False), + skip_layer_keywords=bitnet_config.get('skip_layer_keywords', []), + ) + # Reload weights to pick up the .norm.weight tensors + import safetensors.torch + import glob + shard_files = sorted(glob.glob(str(model_dir / "*.safetensors"))) + if shard_files: + full_state = {} + for sf in shard_files: + full_state.update(safetensors.torch.load_file(sf, device=str(device))) + model.load_state_dict(full_state, strict=True) + print(" ✅ BitLinear applied and weights reloaded") + else: + print(" ⚠️ No bitnet_config.json found, using standard model (norms will be ignored)") + + model = model.to(device) + model.eval() + + inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512) + inputs = {k: v.to(device) for k, v in inputs.items()} + + with torch.no_grad(): + outputs = model(**inputs) + # Mean pooling over non-padding tokens + attention_mask = inputs["attention_mask"] + token_embeddings = outputs.last_hidden_state + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( + input_mask_expanded.sum(1), min=1e-9 + ) + embeddings = F.normalize(embeddings, p=2, dim=1) + + return embeddings.cpu().numpy() + + +def run_llamacpp_embedding(gguf_file: Path, texts: list[str], build_dir: Path) -> np.ndarray: + """Run llama.cpp embedding binary.""" + print(" Running llama.cpp inference on [CPU]...") + import 
subprocess + import tempfile + + embd_bin = build_dir / "bin" / "llama-embedding" + if not embd_bin.exists(): + print(f" [SKIP] {embd_bin} not found, cannot run llama.cpp inference") + return None + + # Run per-text to match PyTorch's per-text inference (avoid padding differences) + all_embeddings = [] + for text in texts: + result = subprocess.run( + [str(embd_bin), "-m", str(gguf_file), "-p", text, + "--embd-normalize", "2", "--embd-output-format", "json", + "--pooling", "mean"], + capture_output=True, text=True, timeout=120 + ) + if result.returncode != 0: + print(f" [ERROR] llama-embedding failed:\n{result.stderr[:500]}") + return None + + output = result.stdout.strip() + try: + data = json.loads(output) + if isinstance(data, dict) and "data" in data: + all_embeddings.append(data["data"][0]["embedding"]) + elif isinstance(data, list): + all_embeddings.append(data[0] if isinstance(data[0], list) else data) + except json.JSONDecodeError: + print(f" [ERROR] Could not parse embedding output") + return None + + return np.array(all_embeddings, dtype=np.float32) + + +def verify_inference(model_dir: Path, gguf_file: Path, texts: list[str], build_dir: Path): + """Compare PyTorch and llama.cpp embedding outputs. 
+ + Automatically detects GGUF type: + - f16 GGUF: disable PyTorch quantization (both sides use full-precision weights) + - i2_s GGUF: enable PyTorch quantization (both sides use ternary weights) + """ + print("=" * 70) + print("Level 2: Inference-level precision verification") + print("=" * 70) + + # Auto-detect: if GGUF filename contains "i2_s", both sides should quantize + gguf_name = gguf_file.name + is_quantized = "i2_s" in gguf_name or "tl1" in gguf_name or "tl2" in gguf_name + disable_quant = not is_quantized + + if is_quantized: + print(f" Detected quantized GGUF ({gguf_name})") + print(f" → PyTorch: BitLinear WITH quantization (weight_quant + activation_quant)") + print(f" → llama.cpp: ternary weights from GGUF") + else: + print(f" Detected f16/f32 GGUF ({gguf_name})") + print(f" → PyTorch: BitLinear WITHOUT quantization (full-precision weights)") + print(f" → llama.cpp: full-precision weights from GGUF") + + pt_emb = run_pytorch_embedding(model_dir, texts, disable_quant=disable_quant) + cpp_emb = run_llamacpp_embedding(gguf_file, texts, build_dir) + + if cpp_emb is None: + print(" ⚠️ Skipped: could not get llama.cpp embeddings") + return False + + print(f"\n PyTorch embedding shape: {pt_emb.shape} (computed on GPU, model.safetensors)") + print(f" llama.cpp embedding shape: {cpp_emb.shape} (computed on CPU, {gguf_file.name})") + + if pt_emb.shape != cpp_emb.shape: + print(f" ❌ Shape mismatch!") + # Try to compare if dimensions match + min_dim = min(pt_emb.shape[-1], cpp_emb.shape[-1]) + print(f" Comparing first {min_dim} dimensions...") + pt_emb = pt_emb[:, :min_dim] + cpp_emb = cpp_emb[:, :min_dim] + + all_ok = True + for i, text in enumerate(texts): + a, b = pt_emb[i], cpp_emb[i] + diff = np.abs(a - b) + max_err = np.max(diff) + mean_err = np.mean(diff) + + # Both vectors are L2-normalized, so cos_sim = dot(a, b) + cos_sim = float(np.dot(a, b)) + + print(f"\n Text: \"{text[:50]}...\"" if len(text) > 50 else f"\n Text: \"{text}\"") + print(f" max_abs_err: 
{max_err:.6e}") + print(f" mean_abs_err: {mean_err:.6e}") + print(f" cos_sim: {cos_sim:.8f}") + + # Judgment thresholds depend on whether we're comparing quantized models + if is_quantized: + # i2_s GGUF vs PyTorch BitLinear: weight quantization matches, + # but activation_quant (8-bit) exists in PyTorch but not in llama.cpp, + # plus bf16->f16 precision loss. cos_sim > 0.95 is good. + if cos_sim > 0.99: + print(f" ✅ Excellent match") + elif cos_sim > 0.97: + print(f" ✅ Good match") + elif cos_sim > 0.95: + print(f" ⚠️ Acceptable (activation_quant + bf16→f16 differences)") + else: + print(f" ❌ Poor — check conversion") + all_ok = False + else: + # f16 GGUF vs PyTorch (no quant): only bf16→f16 + pooling accumulation + if cos_sim > 0.999: + print(f" ✅ Excellent match") + elif cos_sim > 0.99: + print(f" ✅ Good match (bf16→f16 + pooling accumulation)") + elif cos_sim > 0.98: + print(f" ⚠️ Acceptable (short text amplifies per-token error)") + else: + print(f" ❌ Poor — check conversion") + all_ok = False + + return all_ok + + +# ────────────────────────────────────────────────────────────────────── +# Level 3: Per-token hidden state comparison +# ────────────────────────────────────────────────────────────────────── + +def run_pytorch_per_token(model_dir: Path, texts: list[str], disable_quant: bool = False) -> list[np.ndarray]: + """Run PyTorch inference and return per-token hidden states (un-pooled, un-normalized). + + Returns: + List of arrays, each of shape [seq_len, hidden_dim]. 
+ """ + quant_label = "without" if disable_quant else "with" + from transformers import AutoTokenizer, AutoModel + import safetensors.torch + import glob + + device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") + print(f"\n Running PyTorch per-token inference ({quant_label} quantization) on [{device}]...") + + bitnet_src = Path("/home/huangxin/code_list/bitnet-embeddings-v260420/src") + if bitnet_src.exists() and str(bitnet_src) not in sys.path: + sys.path.insert(0, str(bitnet_src)) + + with suppress_stderr(): + tokenizer = AutoTokenizer.from_pretrained(str(model_dir)) + model = AutoModel.from_pretrained(str(model_dir), torch_dtype=torch.float32) + + bitnet_config_file = model_dir / "bitnet_config.json" + if bitnet_config_file.exists(): + from bitnet import replace_linear_with_bitlinear, BitLinear + + if disable_quant: + original_forward = BitLinear.forward + def no_quant_forward(self, x): + w = self.weight + x_norm = self.norm(x) if self.should_rms else x + return torch.nn.functional.linear(x_norm, w, self.bias) + BitLinear.forward = no_quant_forward + + with open(bitnet_config_file) as f: + bitnet_config = json.load(f) + with suppress_stderr(): + model = replace_linear_with_bitlinear( + model, + weight_quant_method=bitnet_config.get('weight_quant_method', 'minmax'), + standard_bitnet=bitnet_config.get('standard_bitnet', False), + skip_layer_keywords=bitnet_config.get('skip_layer_keywords', []), + ) + shard_files = sorted(glob.glob(str(model_dir / "*.safetensors"))) + if shard_files: + full_state = {} + for sf in shard_files: + full_state.update(safetensors.torch.load_file(sf, device=str(device))) + model.load_state_dict(full_state, strict=True) + model = model.to(device) + model.eval() + + all_hidden = [] + for text in texts: + inp = tokenizer([text], return_tensors='pt', padding=False, truncation=True, max_length=512) + inp = {k: v.to(device) for k, v in inp.items()} + with torch.no_grad(): + out = 
model(**inp).last_hidden_state[0].cpu().numpy() # [seq_len, hidden_dim] + all_hidden.append(out) + + return all_hidden + + +def run_llamacpp_per_token(gguf_file: Path, texts: list[str], build_dir: Path) -> list[np.ndarray]: + """Run llama.cpp and return per-token hidden states (un-pooled, un-normalized). + + Returns: + List of arrays, each of shape [seq_len, hidden_dim]. + """ + import subprocess + + print(" Running llama.cpp per-token inference on [CPU]...") + embd_bin = build_dir / "bin" / "llama-embedding" + if not embd_bin.exists(): + print(f" [SKIP] {embd_bin} not found") + return None + + all_hidden = [] + for text in texts: + result = subprocess.run( + [str(embd_bin), "-m", str(gguf_file), "-p", text, + "--embd-normalize", "-1", "--embd-output-format", "array"], + capture_output=True, text=True, timeout=120 + ) + if result.returncode != 0: + print(f" [ERROR] {result.stderr[:200]}") + return None + stdout = result.stdout.strip() + if not stdout: + print(f" [ERROR] Empty stdout for '{text[:30]}'") + return None + all_hidden.append(np.array(json.loads(stdout), dtype=np.float32)) + + return all_hidden + + +def verify_per_token(model_dir: Path, gguf_file: Path, texts: list[str], build_dir: Path): + """Compare per-token hidden states between PyTorch and llama.cpp. + + This is the most precise comparison — no pooling or normalization + to accumulate or amplify errors. 
+ """ + print("=" * 70) + print("Level 3: Per-token hidden state comparison") + print("=" * 70) + + # Auto-detect quantization mode + gguf_name = gguf_file.name + is_quantized = "i2_s" in gguf_name or "tl1" in gguf_name or "tl2" in gguf_name + disable_quant = not is_quantized + + if is_quantized: + print(f" Detected quantized GGUF → PyTorch WITH quantization") + else: + print(f" Detected f16/f32 GGUF → PyTorch WITHOUT quantization") + + pt_hidden = run_pytorch_per_token(model_dir, texts, disable_quant=disable_quant) + cpp_hidden = run_llamacpp_per_token(gguf_file, texts, build_dir) + + if cpp_hidden is None: + print(" ⚠️ Skipped: could not get llama.cpp hidden states") + return False + + all_ok = True + for i, text in enumerate(texts): + pt_h = pt_hidden[i] # [seq_len_pt, hidden_dim] + cpp_h = cpp_hidden[i] # [seq_len_cpp, hidden_dim] + + n_tokens = min(len(pt_h), len(cpp_h)) + # Align from the beginning (skip trailing EOS/padding difference) + pt_sub = pt_h[:n_tokens] + cpp_sub = cpp_h[:n_tokens] + + cos_sims = [] + max_errs = [] + mean_errs = [] + for t in range(n_tokens): + d = np.dot(pt_sub[t], cpp_sub[t]) + n = np.linalg.norm(pt_sub[t]) * np.linalg.norm(cpp_sub[t]) + 1e-9 + cos_sims.append(d / n) + diff = np.abs(pt_sub[t] - cpp_sub[t]) + max_errs.append(np.max(diff)) + mean_errs.append(np.mean(diff)) + + avg_cos = np.mean(cos_sims) + min_cos = np.min(cos_sims) + avg_max_err = np.mean(max_errs) + avg_mean_err = np.mean(mean_errs) + + print(f"\n Text: \"{text[:50]}...\"" if len(text) > 50 else f"\n Text: \"{text}\"") + print(f" tokens: PT={len(pt_h)}(GPU,safetensors), CPP={len(cpp_h)}(CPU,GGUF), compared={n_tokens}") + print(f" avg_max_err: {avg_max_err:.6e}") + print(f" avg_mean_err: {avg_mean_err:.6e}") + print(f" avg_cos_sim: {avg_cos:.8f}") + print(f" min_cos_sim: {min_cos:.8f}") + + # Per-token detail + for t in range(n_tokens): + print(f" token[{t}]: cos={cos_sims[t]:.8f}, max_err={max_errs[t]:.6e}, mean_err={mean_errs[t]:.6e}") + + if min_cos > 0.9999: + 
print(f" ✅ Excellent precision alignment") + elif min_cos > 0.999: + print(f" ✅ Good precision alignment") + elif min_cos > 0.99: + print(f" ⚠️ Acceptable (bf16→f16 accumulated loss)") + else: + print(f" ❌ Poor — check conversion") + all_ok = False + + return all_ok + + +# ────────────────────────────────────────────────────────────────────── +# Main +# ────────────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description="Verify GGUF conversion precision") + parser.add_argument("--model-dir", type=Path, required=True, help="HF model directory") + parser.add_argument("--gguf-file", type=Path, required=True, help="GGUF file to verify") + parser.add_argument("--level", choices=["1", "2", "3", "both", "all"], default="all", + help="Verification level: 1=tensor, 2=embedding, 3=per-token, both=1+2, all=1+2+3") + parser.add_argument("--build-dir", type=Path, + default=Path(__file__).parent.parent / "build", + help="llama.cpp build directory") + parser.add_argument("--text", type=str, nargs="+", + default=["你觉得人工智能的未来是什么样的?", + "The quick brown fox jumps over the lazy dog", + "机器学习是人工智能的一个分支"], + help="Test texts for inference comparison") + args = parser.parse_args() + + print(f"Model dir: {args.model_dir}") + print(f"GGUF file: {args.gguf_file}") + print() + + ok = True + + if args.level in ("1", "all"): + ok = verify_tensors(args.model_dir, args.gguf_file) and ok + + if args.level in ("2", "all"): + print() + ok = verify_inference(args.model_dir, args.gguf_file, args.text, args.build_dir) and ok + + if args.level in ("3", "all"): + print() + ok = verify_per_token(args.model_dir, args.gguf_file, args.text, args.build_dir) and ok + + print("\n" + "=" * 70) + if ok: + print("Overall: ✅ PASSED") + else: + print("Overall: ❌ ISSUES FOUND") + print("=" * 70) + + sys.exit(0 if ok else 1) + + +if __name__ == "__main__": + main() diff --git a/src/ggml-bitnet-lut.cpp b/src/ggml-bitnet-lut.cpp index 
59422d548..dc1f1d645 100644 --- a/src/ggml-bitnet-lut.cpp +++ b/src/ggml-bitnet-lut.cpp @@ -5,6 +5,10 @@ #include #include +#ifdef __x86_64__ +#include +#endif + #include "ggml-bitnet.h" #include "ggml-quants.h" #include "bitnet-lut-kernels.h" diff --git a/src/ggml-bitnet-mad.cpp b/src/ggml-bitnet-mad.cpp index 4ba9d6509..ad18bac04 100644 --- a/src/ggml-bitnet-mad.cpp +++ b/src/ggml-bitnet-mad.cpp @@ -808,7 +808,7 @@ void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size accu[iy] = _mm256_setzero_si256(); } - int8_t * y_col = y + col * by; + const int8_t * y_col = y + col * by; for (int i = 0; i < group32_num; i++) { const uint8_t *px = x + i * 1024; diff --git a/utils/convert-bitnet-embedding-to-gguf.py b/utils/convert-bitnet-embedding-to-gguf.py new file mode 100644 index 000000000..74bb1e1be --- /dev/null +++ b/utils/convert-bitnet-embedding-to-gguf.py @@ -0,0 +1,523 @@ +#!/usr/bin/env python3 +""" +Convert bitnet-embeddings-0.6b (Qwen3 + BitNet per-projection norm) from +HuggingFace safetensors to GGUF format. 
+ +Supports: + --outtype f16 : float16 weights (norms in f32) + --outtype f32 : float32 everything + --outtype i2_s : ternary quantized (I2_S layout, platform-independent) + +Usage: + python convert-bitnet-embedding-to-gguf.py \ + /path/to/bitnet-embeddings-0.6b \ + --outfile output.gguf \ + --outtype f16 + + # Ternary quantized (i2_s): + python convert-bitnet-embedding-to-gguf.py \ + /path/to/bitnet-embeddings-0.6b \ + --outfile output-i2_s.gguf \ + --outtype i2_s +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import sys +from hashlib import sha256 +from pathlib import Path +from typing import Any, Iterator + +import numpy as np +import torch + +# Allow using the local gguf-py if present +if "NO_LOCAL_GGUF" not in os.environ: + _local_gguf = Path(__file__).parent / "gguf-py" + if _local_gguf.exists(): + sys.path.insert(1, str(_local_gguf)) +import gguf + +logger = logging.getLogger("convert-bitnet-embedding") + +# --------------------------------------------------------------------------- +# Tensor name mapping: HuggingFace -> GGUF +# --------------------------------------------------------------------------- + +def build_tensor_name_map(n_layers: int) -> dict[str, str]: + """Build HF tensor name -> GGUF tensor name mapping.""" + mapping: dict[str, str] = { + "embed_tokens.weight": "token_embd.weight", + "norm.weight": "output_norm.weight", + } + + for i in range(n_layers): + pfx = f"layers.{i}" + blk = f"blk.{i}" + + mapping.update({ + # Layer norms + f"{pfx}.input_layernorm.weight": f"{blk}.attn_norm.weight", + f"{pfx}.post_attention_layernorm.weight": f"{blk}.ffn_norm.weight", + + # Self-attention projections + f"{pfx}.self_attn.q_proj.weight": f"{blk}.attn_q.weight", + f"{pfx}.self_attn.k_proj.weight": f"{blk}.attn_k.weight", + f"{pfx}.self_attn.v_proj.weight": f"{blk}.attn_v.weight", + f"{pfx}.self_attn.o_proj.weight": f"{blk}.attn_output.weight", + + # QK head norms (standard Qwen3) + 
f"{pfx}.self_attn.q_norm.weight": f"{blk}.attn_q_norm.weight", + f"{pfx}.self_attn.k_norm.weight": f"{blk}.attn_k_norm.weight", + + # Per-projection input norms (BitNet-specific) + f"{pfx}.self_attn.q_proj.norm.weight": f"{blk}.attn_q_norm_in.weight", + f"{pfx}.self_attn.k_proj.norm.weight": f"{blk}.attn_k_norm_in.weight", + f"{pfx}.self_attn.v_proj.norm.weight": f"{blk}.attn_v_norm_in.weight", + f"{pfx}.self_attn.o_proj.norm.weight": f"{blk}.attn_output_norm_in.weight", + + # MLP projections + f"{pfx}.mlp.gate_proj.weight": f"{blk}.ffn_gate.weight", + f"{pfx}.mlp.up_proj.weight": f"{blk}.ffn_up.weight", + f"{pfx}.mlp.down_proj.weight": f"{blk}.ffn_down.weight", + + # Per-projection input norms for MLP (BitNet-specific) + f"{pfx}.mlp.gate_proj.norm.weight": f"{blk}.ffn_gate_norm_in.weight", + f"{pfx}.mlp.up_proj.norm.weight": f"{blk}.ffn_up_norm_in.weight", + f"{pfx}.mlp.down_proj.norm.weight": f"{blk}.ffn_down_norm_in.weight", + }) + + return mapping + + +# --------------------------------------------------------------------------- +# Tokenizer handling (GPT-2 / BPE for Qwen3) +# --------------------------------------------------------------------------- + +def get_vocab_base_pre(tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer + + chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n\U0001f680 (normal) \U0001f636‍\U0001f32b️ (multiple emojis concatenated) ✅ \U0001f999\U0001f999 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច\U0001f601 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? 
We\'Ve a\'lL' + + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + logger.debug(f"chktok: {chktok}") + logger.debug(f"chkhsh: {chkhsh}") + + res = None + + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! + if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B + res = "llama-bpe" + if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base + res = "deepseek-llm" + if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base + res = "deepseek-coder" + if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": + # ref: https://huggingface.co/tiiuae/falcon-7b + res = "falcon" + if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": + # ref: https://huggingface.co/openai-community/gpt2 + res = "gpt-2" + if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": + # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B + res = "qwen2" + + if res is None: + logger.warning("\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") + 
logger.warning("**") + logger.warning(f"** chkhsh: {chkhsh}") + logger.warning("**************************************************************************************") + logger.warning("\n") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + + logger.debug(f"tokenizer.ggml.pre: {repr(res)}") + logger.debug(f"chkhsh: {chkhsh}") + + return res + + +def _does_token_look_special(token: str) -> bool: + """Check if a token looks like a special token (e.g., <|...|>, <...>).""" + if not token: + return False + # Matches patterns like <|endoftext|>, , , [CLS], [SEP], etc. + if token.startswith(("<|", "<", "[")) and token.endswith(("|>", ">", "]")): + return True + return False + + +def set_vocab(gguf_writer: gguf.GGUFWriter, dir_model: Path, hparams: dict): + """Set GPT-2 BPE vocab for Qwen3.""" + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + + tokpre = get_vocab_base_pre(tokenizer) + + tokens: list[str] = [] + toktypes: list[int] = [] + + reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + added_tokens_decoder = tokenizer.added_tokens_decoder + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + token = reverse_vocab[i] + + # Only encode-decode non-normalized tokens (matching llama.cpp upstream) + if not added_tokens_decoder[i].normalized: + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + + if added_tokens_decoder[i].special or _does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + # Pre-normalize user-defined spaces (for Gemma-style tokenizers) + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") + toktypes.append(gguf.TokenType.USER_DEFINED) + + 
tokens.append(token) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + gguf_writer.add_tokenizer_model("gpt2") + gguf_writer.add_tokenizer_pre(tokpre) + gguf_writer.add_token_list(tokens) + gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + # Override EOS token: PyTorch tokenizer appends <|endoftext|> (151643) as the + # sentence-end marker, not <|im_end|> (151645). For last-token pooling to work + # correctly, llama.cpp must append the same token. + special_vocab.special_token_ids["eos"] = 151643 + special_vocab.add_to_gguf(gguf_writer) + + # Embedding models need EOS token appended for last-token pooling + gguf_writer.add_add_eos_token(True) + + +# --------------------------------------------------------------------------- +# GGUF metadata +# --------------------------------------------------------------------------- + +def set_gguf_parameters(gguf_writer: gguf.GGUFWriter, hparams: dict, dir_model: Path, ftype: int): + gguf_writer.add_name(dir_model.name) + + n_layers = hparams["num_hidden_layers"] + n_embd = hparams["hidden_size"] + n_head = hparams["num_attention_heads"] + n_head_kv = hparams.get("num_key_value_heads", n_head) + n_ff = hparams["intermediate_size"] + + gguf_writer.add_block_count(n_layers) + gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768)) + gguf_writer.add_embedding_length(n_embd) + gguf_writer.add_feed_forward_length(n_ff) + gguf_writer.add_head_count(n_head) + gguf_writer.add_head_count_kv(n_head_kv) + gguf_writer.add_vocab_size(hparams["vocab_size"]) + + head_dim = hparams.get("head_dim", n_embd // n_head) + gguf_writer.add_rope_dimension_count(head_dim) + gguf_writer.add_key_length(head_dim) + gguf_writer.add_value_length(head_dim) + + if hparams.get("rope_theta") is not None: + gguf_writer.add_rope_freq_base(hparams["rope_theta"]) + if hparams.get("rms_norm_eps") is not None: + 
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + + gguf_writer.add_file_type(ftype) + + # Pooling type for embedding models + # Try to read from modules.json / 1_Pooling/config.json (sentence-transformers convention) + pooling_type = None + module_path = dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in modules: + if mod["type"].endswith("Pooling"): + pooling_path = dir_model / mod["path"] / "config.json" + if pooling_path.is_file(): + with open(pooling_path, encoding="utf-8") as f: + pooling = json.load(f) + if pooling.get("pooling_mode_mean_tokens"): + pooling_type = gguf.PoolingType.MEAN + elif pooling.get("pooling_mode_cls_token"): + pooling_type = gguf.PoolingType.CLS + elif pooling.get("pooling_mode_lasttoken"): + pooling_type = gguf.PoolingType.LAST + break + if pooling_type is None: + # Default to MEAN pooling for embedding models + logger.info(" No pooling config found, defaulting to MEAN pooling") + pooling_type = gguf.PoolingType.MEAN + gguf_writer.add_pooling_type(pooling_type) + + logger.info(f" n_layers={n_layers}, n_embd={n_embd}, n_head={n_head}, n_head_kv={n_head_kv}, n_ff={n_ff}") + + +# --------------------------------------------------------------------------- +# Tensor iteration from safetensors +# --------------------------------------------------------------------------- + +def iter_tensors(dir_model: Path) -> Iterator[tuple[str, torch.Tensor]]: + """Yield (name, tensor) from safetensors files.""" + from safetensors import safe_open + + safetensor_files = sorted(dir_model.glob("*.safetensors")) + if not safetensor_files: + raise FileNotFoundError(f"No .safetensors files in {dir_model}") + + for sf_path in safetensor_files: + logger.info(f"Loading {sf_path.name}") + with safe_open(str(sf_path), framework="pt", device="cpu") as f: + for name in f.keys(): + yield name, f.get_tensor(name) + + +# 
--------------------------------------------------------------------------- +# I2_S ternary packing (platform-independent) +# --------------------------------------------------------------------------- +# +# I2_S format (from dequantize_row_i2_s in ggml-quants.c): +# - Every 128 values form a block, packed into 32 bytes +# - Each byte stores 4 values at positions [0*32+gp, 1*32+gp, 2*32+gp, 3*32+gp] +# where gp is the byte index within the 32-byte group +# - Encoding per byte: c0=(b>>6)&3, c1=(b>>4)&3, c2=(b>>2)&3, c3=(b>>0)&3 +# - Value mapping: 0 -> -1, 1 -> 0, 2 -> +1, 3 -> 0 +# - Scale is stored as a separate tensor (tensor_name + "_scale") + +def quantize_to_i2_s(w: np.ndarray) -> np.ndarray: + """Quantize float weights to ternary and pack into I2_S layout. + + Uses the same quantization as BitLinear weight_quant_minmax(): + scale = 1.0 / mean(|w|) + q = round(w * scale).clamp(-1, 1) + dequant = q / scale = q * mean(|w|) + + The I2_S format is self-contained: packed ternary bytes followed by a f32 scale + appended at the end of the data buffer. 
def main():
    """Convert a BitNet embedding checkpoint directory into a GGUF file.

    Command line::

        <model_dir> [--outfile PATH] [--outtype {f32,f16,i2_s}] [--verbose]

    Steps: read ``config.json`` from the model directory, write the GGUF
    metadata and vocabulary, then stream every checkpoint tensor through the
    name map and store it as

    * I2_S packed ternary data for 2-D linear weights when ``--outtype i2_s``;
    * float16 for everything else when ``--outtype`` is ``f16`` or ``i2_s``;
    * float32 for everything when ``--outtype f32``.

    Exits with status 1 when the model path is not a directory or when the
    configuration does not describe a ``qwen3`` model.
    """
    parser = argparse.ArgumentParser(description="Convert bitnet-embeddings to GGUF")
    parser.add_argument("model", type=Path, help="Model directory")
    parser.add_argument("--outfile", type=Path, default=None, help="Output GGUF file")
    parser.add_argument("--outtype", choices=["f32", "f16", "i2_s"], default="f16",
                        help="Output type: f32, f16, or i2_s (ternary quantized)")
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    dir_model = args.model
    if not dir_model.is_dir():
        logger.error(f"{dir_model} is not a directory")
        sys.exit(1)

    # Default output filename lives next to the checkpoint.
    if args.outfile is None:
        suffix = {"f32": "-f32", "f16": "-f16", "i2_s": "-f16-new-i2_s"}[args.outtype]
        args.outfile = dir_model / f"{dir_model.name}{suffix}.gguf"

    # Load config
    with open(dir_model / "config.json", encoding="utf-8") as f:
        hparams = json.load(f)

    # Validate explicitly instead of with `assert`, which is removed under -O.
    arch = hparams.get("model_type", "qwen3")
    if arch != "qwen3":
        logger.error(f"Expected qwen3 architecture, got {arch}")
        sys.exit(1)

    n_layers = hparams["num_hidden_layers"]

    # File-type ids: 0 = GGML F32, 1 = GGML F16, 40 = LLAMA_FTYPE_MOSTLY_I2_S.
    ftype = {"f32": 0, "f16": 1, "i2_s": 40}[args.outtype]
    # Both "f16" and "i2_s" store every non-quantized tensor as float16.
    store_half = args.outtype in ("f16", "i2_s")

    logger.info(f"Converting {dir_model.name} to GGUF ({args.outtype})")

    # Create GGUF writer
    gguf_writer = gguf.GGUFWriter(str(args.outfile), "qwen3")

    # Set parameters
    set_gguf_parameters(gguf_writer, hparams, dir_model, ftype)

    # Set vocab
    logger.info("Setting tokenizer/vocab...")
    set_vocab(gguf_writer, dir_model, hparams)

    # Build tensor name map
    tensor_map = build_tensor_name_map(n_layers)

    # Process tensors
    logger.info("Processing tensors...")
    tensor_count = 0
    for hf_name, data_torch in iter_tensors(dir_model):
        # Skip tensors we don't need
        if hf_name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
            continue

        # Strip "model." prefix if present
        name = hf_name
        if name.startswith("model."):
            name = name[len("model."):]

        # Look up GGUF name
        gguf_name = tensor_map.get(name)
        if gguf_name is None:
            logger.warning(f"Skipping unmapped tensor: {hf_name}")
            continue

        old_dtype = data_torch.dtype

        # Convert bf16 -> f32 first (bf16 not directly supported by gguf)
        if data_torch.dtype == torch.bfloat16:
            data_torch = data_torch.to(torch.float32)

        data = data_torch.squeeze().numpy()
        data_shape = data.shape

        # Only 2-D weights that are neither norms nor the token embedding are
        # candidates for ternary quantization.
        is_norm = gguf_name.endswith("_norm.weight") or gguf_name.endswith("_norm_in.weight")
        is_embed = gguf_name == "token_embd.weight"
        is_linear_weight = len(data_shape) == 2 and not is_norm and not is_embed

        if args.outtype == "i2_s" and is_linear_weight:
            # --- I2_S ternary packing (scale embedded in data) ---
            packed = quantize_to_i2_s(data)
            dims = ", ".join(str(n) for n in reversed(data_shape))
            shape_str = "{" + dims + "}"
            logger.info(f" {gguf_name}: {list(data_shape)} {old_dtype} -> I2_S, shape = {shape_str}")
            gguf_writer.add_tensor(gguf_name, packed, raw_shape=data_shape,
                                   raw_dtype=gguf.GGMLQuantizationType.I2_S)
        elif store_half:
            # Embedding, norms, 1-D tensors and (for f16) linear weights.
            # NOTE(review): norm / 1-D tensors are stored as float16 here as
            # well; confirm the runtime accepts non-float32 norm weights.
            data = data.astype(np.float16)
            logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float16")
            gguf_writer.add_tensor(gguf_name, data)
        else:
            if data.dtype != np.float32:
                data = data.astype(np.float32)
            logger.info(f" {gguf_name}: {list(data_torch.shape)} {old_dtype} -> float32")
            gguf_writer.add_tensor(gguf_name, data)
        tensor_count += 1

    logger.info(f"Total tensors written: {tensor_count}")

    # Note: output.weight (lm_head) is skipped for embedding models —
    # it is not needed (no token generation) and saves ~297MB for this model.

    # Write GGUF; always release the writer, even if a write step fails.
    logger.info(f"Writing to {args.outfile}...")
    try:
        gguf_writer.write_header_to_file()
        gguf_writer.write_kv_data_to_file()
        gguf_writer.write_tensors_to_file()
    finally:
        gguf_writer.close()

    logger.info("Done!")


if __name__ == "__main__":
    main()
import argparse
import re

import numpy as np

# Tensor type names handled as block-quantized payloads; every other type is
# read as a plain float tensor.
_QUANTIZED_TYPE_NAMES = frozenset((
    "I2_S", "TL1", "TL2", "Q4_0", "Q4_1", "Q5_0", "Q5_1",
    "Q8_0", "Q8_1", "Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K",
    "IQ2_XXS", "IQ2_XS", "IQ3_XXS", "IQ1_S", "IQ4_NL",
    "IQ3_S", "IQ2_S", "IQ4_XS", "TQ1_0", "TQ2_0",
))

# 2-bit code -> ternary value; code 3 is unused and displayed as 0.0.
_I2S_CODE_VALUES = (-1.0, 0.0, 1.0, 0.0)
# I2_S stores 128 values per block in 32 bytes: byte j of a block carries
# values j, 32+j, 64+j and 96+j in bit pairs 7-6, 5-4, 3-2 and 1-0.
_I2S_BLOCK_VALUES = 128
_I2S_BLOCK_BYTES = 32


def natural_sort_key(s):
    """Return a sort key that orders embedded numbers numerically.

    The string is split into alternating text and digit runs; digit runs
    become ints and text runs are lower-cased, so ``"blk.2"`` sorts before
    ``"blk.10"``.
    """
    # str.isdecimal() accepts exactly the characters that \d matches, so a
    # text run made of digit-like characters (e.g. a superscript two) stays
    # a string instead of making int() raise ValueError.
    return [int(c) if c.isdecimal() else c.lower() for c in re.split(r'(\d+)', s)]


def _decode_i2_s_head(packed, count):
    """Decode the first *count* ternary values from I2_S packed bytes."""
    decoded = []
    for idx in range(count):
        block, within = divmod(idx, _I2S_BLOCK_VALUES)
        group, pos = divmod(within, _I2S_BLOCK_BYTES)
        byte_idx = block * _I2S_BLOCK_BYTES + pos
        if byte_idx >= len(packed):
            break  # trailing partial block is not present in the data
        code = (int(packed[byte_idx]) >> (6 - 2 * group)) & 3
        decoded.append(_I2S_CODE_VALUES[code])
    return decoded


def _i2_s_code_histogram(packed):
    """Count how often each 2-bit code (0..3) occurs in *packed*."""
    hist = np.zeros(4, dtype=np.int64)
    for shift in (6, 4, 2, 0):
        hist += np.bincount((packed >> shift) & 3, minlength=4)
    return hist


def _print_float_tensor(t, dtype_name, show_stats, n_values):
    """Print statistics and leading values of an unquantized tensor."""
    data = np.asarray(t.data)
    if dtype_name == "F16":
        values = data.view(np.float16).astype(np.float32)
    elif dtype_name == "BF16":
        # BF16 is the upper half of a float32: widen and shift into place.
        halves = data.view(np.uint16)
        values = np.left_shift(halves.astype(np.uint32), 16).view(np.float32)
    else:
        # F32 and any other unquantized type are reinterpreted as float32.
        values = data.view(np.float32)
    # The reader may hand back multi-dimensional data; flatten so that
    # "first N values" really means N scalars rather than N rows.
    values = values.reshape(-1)

    if show_stats and values.size > 0:
        unique_count = len(np.unique(values))
        print(f" Min: {values.min():.6f} | Max: {values.max():.6f} | "
              f"Mean: {values.mean():.6f} | Std: {values.std():.6f} | "
              f"Unique: {unique_count:,}")

    if n_values > 0 and values.size > 0:
        n = min(n_values, values.size)
        print(f" First {n} values: {values[:n].tolist()}")


def _print_i2_s(raw, n_elements, n_values):
    """Print scale, leading values and code distribution of an I2_S tensor."""
    packed_size = n_elements // 4
    packed = raw[:packed_size]

    # The float32 scale, when present, directly follows the packed bytes.
    scale = None
    if len(raw) >= packed_size + 4:
        scale_bytes = raw[packed_size:packed_size + 4].tobytes()
        scale = float(np.frombuffer(scale_bytes, dtype=np.float32)[0])
        print(f" I2_S scale: {scale:.6f}")
    else:
        print(f" I2_S: no embedded scale (data = packed bytes only)")

    n_show = min(n_values, n_elements) if n_values > 0 else 0
    if n_show > 0:
        decoded = _decode_i2_s_head(packed, n_show)
        if scale is not None:
            decoded = [v * scale for v in decoded]
        print(f" First {n_show} decoded values: {decoded}")

    total_vals = min(len(packed) * 4, n_elements)
    if total_vals == 0:
        return  # nothing to count; avoids dividing by zero below
    hist = _i2_s_code_histogram(packed)
    minus, zero, plus = int(hist[0]), int(hist[1]), int(hist[2])
    print(f" Ternary distribution: "
          f"-1: {minus:,} ({100*minus/total_vals:.1f}%) | "
          f" 0: {zero:,} ({100*zero/total_vals:.1f}%) | "
          f"+1: {plus:,} ({100*plus/total_vals:.1f}%)")


def _print_q6_k(raw):
    """Print the block count and first block scale of a Q6_K tensor."""
    block_size = 210  # bytes per 256-element Q6_K block
    n_blocks = len(raw) // block_size
    print(f" Q6_K blocks: {n_blocks}")
    if n_blocks > 0:
        # d (fp16) occupies the last two bytes of the first block.
        d = np.frombuffer(raw[208:210].tobytes(), dtype=np.float16).astype(np.float32)[0]
        print(f" First block d (scale): {d:.6f}")


def main():
    """Print names, shapes, types and weight statistics of a GGUF file.

    Tensors are listed in natural name order, optionally restricted with
    ``--filter``. Float tensors get min/max/mean/std/unique statistics and
    their leading values; I2_S tensors get their embedded scale, leading
    decoded values and ternary distribution; Q6_K tensors get a block
    summary. ``--raw-bytes N`` additionally dumps the first N payload bytes
    of quantized tensors.
    """
    # Imported here so the helpers above stay importable without gguf.
    from gguf import GGUFReader

    parser = argparse.ArgumentParser(description="Print weight values from a GGUF model file.")
    parser.add_argument(
        "model_path",
        nargs="?",
        default="/data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/bitnet-embeddings-0.6b-f16-i2_s.gguf",
        help="Path to the .gguf file",
    )
    parser.add_argument("--values", "-v", type=int, default=10,
                        help="Number of values to print per tensor (default: 10, 0=none)")
    parser.add_argument("--stats", "-s", action="store_true", default=True,
                        help="Print statistics (min, max, mean, std, unique count)")
    parser.add_argument("--no-stats", action="store_false", dest="stats",
                        help="Disable statistics")
    parser.add_argument("--filter", "-f", type=str, default=None,
                        help="Only show tensors whose name contains this string")
    parser.add_argument("--raw-bytes", "-r", type=int, default=0,
                        help="Print first N raw bytes for quantized tensors (default: 0)")
    args = parser.parse_args()

    reader = GGUFReader(args.model_path)
    tensors = sorted(reader.tensors, key=lambda t: natural_sort_key(t.name))

    if args.filter:
        tensors = [t for t in tensors if args.filter in t.name]

    print(f"Model: {args.model_path}")
    print(f"Total tensors: {len(tensors)}")
    print("=" * 120)

    # Quantized payloads are read straight from the file (GGUFReader may
    # truncate quantized tensor data); the handle is closed on any exit path.
    with open(args.model_path, 'rb') as model_file:
        for i, t in enumerate(tensors):
            dtype_name = t.tensor_type.name
            # Plain ints keep the printed shape free of NumPy scalar reprs.
            shape = [int(d) for d in t.shape]
            n_elements = int(np.prod(shape)) if shape else 0
            data_size = int(t.n_bytes) if hasattr(t, 'n_bytes') else len(t.data)

            print(f"\n[{i}] {t.name}")
            print(f" Shape: {shape} | Type: {dtype_name} | Elements: {n_elements:,} | Data size: {data_size:,} bytes")

            if dtype_name not in _QUANTIZED_TYPE_NAMES:
                _print_float_tensor(t, dtype_name, args.stats, args.values)
                continue

            model_file.seek(t.data_offset)
            raw = np.frombuffer(model_file.read(t.n_bytes), dtype=np.uint8)

            if dtype_name == "I2_S" and len(raw) > 0:
                _print_i2_s(raw, n_elements, args.values)
            elif dtype_name == "Q6_K" and len(raw) > 0:
                _print_q6_k(raw)
            else:
                print(f" [Quantized tensor, raw decode not implemented for {dtype_name}]")

            if args.raw_bytes > 0 and len(raw) > 0:
                n = min(args.raw_bytes, len(raw))
                hex_str = ' '.join(f'{int(b):02x}' for b in raw[:n])
                print(f" First {n} raw bytes: {hex_str}")

    print(f"\n{'=' * 120}")
    print(f"Total: {len(tensors)} tensors")


if __name__ == "__main__":
    main()
"""Sort strings with embedded numbers in natural (numeric) order.""" + return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', s)] + + +def main(): + parser = argparse.ArgumentParser(description="Print all layers in a safetensors model file.") + parser.add_argument( + "model_path", + nargs="?", + default="/data2/huangxin/model_list/microsoft_release_multilingual_models/bitnet-embeddings-0.6b/model.safetensors", + help="Path to the model.safetensors file", + ) + args = parser.parse_args() + + f = safe_open(args.model_path, framework="pt") + keys = sorted(f.keys(), key=natural_sort_key) + + print(f"Model: {args.model_path}") + print(f"Total layers: {len(keys)}") + print(f"{'Index':<8}{'Layer Name':<60}{'Shape':<30}{'Dtype'}") + print("-" * 120) + + for i, key in enumerate(keys): + tensor = f.get_tensor(key) + unique_count = tensor.unique().numel() + print(f"{i:<8}{key:<60}{str(list(tensor.shape)):<30}{tensor.dtype}") + # print(f" Min: {tensor.min().item():.6f}, Max: {tensor.max().item():.6f}, " + # f"Mean: {tensor.float().mean().item():.6f}, Unique: {unique_count}") + # vals = tensor.flatten()[:10].tolist() + # print(f" First 10 values: {vals}") + # print() + + +if __name__ == "__main__": + main()