-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaffiliated.bib
More file actions
1096 lines (1055 loc) · 71.7 KB
/
affiliated.bib
File metadata and controls
1096 lines (1055 loc) · 71.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@inproceedings{liu-etal-2024-multilingual,
    title = "Are Multilingual {LLM}s Culturally-Diverse Reasoners? An Investigation into Multicultural Proverbs and Sayings",
    venue = "NAACL",
    author = "Liu, Chen Cecilia and
      Koto, Fajri and
      Baldwin, Timothy and
      Gurevych, Iryna",
    editor = "Duh, Kevin and
      Gomez, Helena and
      Bethard, Steven",
    booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    month = jun,
    year = "2024",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.naacl-long.112/",
    doi = "10.18653/v1/2024.naacl-long.112",
    pages = "2016--2039",
    abstract = "Large language models (LLMs) are highly adept at question answering and reasoning tasks, but when reasoning in a situational context, human expectations vary depending on the relevant cultural common ground. As languages are associated with diverse cultures, LLMs should also be culturally-diverse reasoners. In this paper, we study the ability of a wide range of state-of-the-art multilingual LLMs (mLLMs) to reason with proverbs and sayings in a conversational context. Our experiments reveal that: (1) mLLMs ``know'' limited proverbs and memorizing proverbs does not mean understanding them within a conversational context; (2) mLLMs struggle to reason with figurative proverbs and sayings, and when asked to select the wrong answer (instead of asking it to select the correct answer); and (3) there is a ``culture gap'' in mLLMs when reasoning about proverbs and sayings translated from other languages. We construct and release our evaluation dataset MAPS (MulticulturAl Proverbs and Sayings) for proverb understanding with conversational context for six different languages."
}
@inproceedings{cahyawijaya-etal-2024-cendol,
    title = "Cendol: Open Instruction-tuned Generative Large Language Models for {I}ndonesian Languages",
    venue = "ACL",
    author = "Cahyawijaya, Samuel and
      Lovenia, Holy and
      Koto, Fajri and
      Putri, Rifki and
      Cenggoro, Wawan and
      Lee, Jhonson and
      Akbar, Salsabil and
      Dave, Emmanuel and
      Nuurshadieq, Nuurshadieq and
      Mahendra, Muhammad and
      Putri, Rr and
      Wilie, Bryan and
      Winata, Genta Indra and
      Aji, Alham Fikri and
      Purwarianti, Ayu and
      Fung, Pascale",
    editor = "Ku, Lun-Wei and
      Martins, Andre and
      Srikumar, Vivek",
    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.acl-long.796/",
    doi = "10.18653/v1/2024.acl-long.796",
    pages = "14899--14914",
    abstract = "Large language models (LLMs) show remarkable human-like capability in various domains and languages. To bridge this quality gap, we introduce Cendol, a collection of Indonesian LLMs encompassing both decoder-only and encoder-decoder architectures across a range of model sizes. We highlight Cendol{'}s effectiveness across a diverse array of tasks, attaining {\textasciitilde}20{\%} improvement, and demonstrate its capability to generalize to unseen tasks and indigenous languages of Indonesia. Furthermore, Cendol models showcase improved human favorability despite their limitations in capturing indigenous knowledge and cultural values in Indonesia. In addition, we discuss the shortcomings of parameter-efficient tunings, such as LoRA, for language adaptation. Alternatively, we propose the usage of vocabulary adaptation to enhance efficiency. Lastly, we evaluate the safety of Cendol and showcase that safety in pre-training in one language such as English is transferable to low-resource languages, such as Indonesian, even without RLHF and safety fine-tuning."
}
@inproceedings{wibowo-etal-2024-copal,
    title = "{COPAL}-{ID}: {I}ndonesian Language Reasoning with Local Culture and Nuances",
    venue = "ACL",
    author = "Wibowo, Haryo and
      Fuadi, Erland and
      Nityasya, Made and
      Prasojo, Radityo Eko and
      Aji, Alham Fikri",
    editor = "Duh, Kevin and
      Gomez, Helena and
      Bethard, Steven",
    booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    month = jun,
    year = "2024",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.naacl-long.77/",
    doi = "10.18653/v1/2024.naacl-long.77",
    pages = "1404--1422",
    abstract = "We present COPAL-ID, a novel, public Indonesian language common sense reasoning dataset. Unlike the previous Indonesian COPA dataset (XCOPA-ID), COPAL-ID incorporates Indonesian local and cultural nuances, and therefore, provides a more natural portrayal of day-to-day causal reasoning within the Indonesian cultural sphere. Professionally written by natives from scratch, COPAL-ID is more fluent and free from awkward phrases, unlike the translated XCOPA-ID. In addition, we present COPAL-ID in both standard Indonesian and in Jakartan Indonesian{--}a dialect commonly used in daily conversation. COPAL-ID poses a greater challenge for existing open-sourced and closed state-of-the-art multilingual language models, yet is trivially easy for humans. Our findings suggest that general multilingual models struggle to perform well, achieving 66.91{\%} accuracy on COPAL-ID. South-East Asian-specific models achieve slightly better performance of 73.88{\%} accuracy. Yet, this number still falls short of near-perfect human performance. This shows that these language models are still way behind in comprehending the local nuances of Indonesian."
}
@inproceedings{koto-2025-cracking,
  title     = {Cracking the Code: Multi-domain {LLM} Evaluation on Real-World Professional Exams in {I}ndonesia},
  venue     = {NAACL},
  author    = {Koto, Fajri},
  editor    = {Chen, Weizhu and Yang, Yi and Kachuee, Mohammad and Fu, Xue-Yong},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-industry.69/},
  doi       = {10.18653/v1/2025.naacl-industry.69},
  pages     = {938--948},
  isbn      = {979-8-89176-194-0},
  abstract  = {While knowledge evaluation in large language models has predominantly focused on academic subjects like math and physics, these assessments often fail to capture the practical demands of real-world professions. In this paper, we introduce IndoCareer, a dataset comprising 8,834 multiple-choice questions designed to evaluate performance in vocational and professional certification exams across various fields. With a focus on Indonesia, IndoCareer provides rich local contexts, spanning six key sectors: (1) healthcare, (2) insurance and finance, (3) creative and design, (4) tourism and hospitality, (5) education and training, and (6) law. Our comprehensive evaluation of 27 large language models shows that these models struggle particularly in fields with strong local contexts, such as insurance and finance. Additionally, while using the entire dataset, shuffling answer options generally maintains consistent evaluation results across models, but it introduces instability specifically in the insurance and finance sectors.}
}
@inproceedings{koto-etal-2024-zero,
  title     = {Zero-shot Sentiment Analysis in Low-Resource Languages Using a Multilingual Sentiment Lexicon},
  venue     = {EACL},
  author    = {Koto, Fajri and Beck, Tilman and Talat, Zeerak and Gurevych, Iryna and Baldwin, Timothy},
  editor    = {Graham, Yvette and Purver, Matthew},
  booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = mar,
  year      = {2024},
  address   = {St. Julian{'}s, Malta},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.eacl-long.18/},
  doi       = {10.18653/v1/2024.eacl-long.18},
  pages     = {298--320},
  abstract  = {Improving multilingual language models capabilities in low-resource languages is generally difficult due to the scarcity of large-scale data in those languages. In this paper, we relax the reliance on texts in low-resource languages by using multilingual lexicons in pretraining to enhance multilingual capabilities. Specifically, we focus on zero-shot sentiment analysis tasks across 34 languages, including 6 high/medium-resource languages, 25 low-resource languages, and 3 code-switching datasets. We demonstrate that pretraining using multilingual lexicons, without using any sentence-level sentiment data, achieves superior zero-shot performance compared to models fine-tuned on English sentiment datasets, and large language models like GPT{--}3.5, BLOOMZ, and XGLM. These findings are observable for unseen low-resource languages to code-mixed scenarios involving high-resource languages.}
}
@inproceedings{ustun-etal-2024-aya,
  title     = {Aya Model: An Instruction Finetuned Open-Access Multilingual Language Model},
  venue     = {ACL},
  author    = {{\"U}st{\"u}n, Ahmet and Aryabumi, Viraat and Yong, Zheng and Ko, Wei-Yin and D{'}souza, Daniel and Onilude, Gbemileke and Bhandari, Neel and Singh, Shivalika and Ooi, Hui-Lee and Kayid, Amr and Vargus, Freddie and Blunsom, Phil and Longpre, Shayne and Muennighoff, Niklas and Fadaee, Marzieh and Kreutzer, Julia and Hooker, Sara},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.acl-long.845/},
  doi       = {10.18653/v1/2024.acl-long.845},
  pages     = {15894--15939},
  abstract  = {Recent breakthroughs in large language models (LLMs) have centered around a handful of data-rich languages. What does it take to broaden access to breakthroughs beyond first-class citizen languages? Our work introduces Aya, a massively multilingual generative language model that follows instructions in 101 languages of which over 50{\%} are considered as lower-resourced. Aya outperforms mT0 and BLOOMZ on the majority of tasks while covering double the number of languages. We introduce extensive new evaluation suites that broaden the state-of-art for multilingual eval across 99 languages {---}{---} including discriminative and generative tasks, human evaluation, and simulated win rates that cover both held-out tasks and in-distribution performance. Furthermore, we conduct detailed investigations on the optimal finetuning mixture composition, data pruning, as well as the toxicity, bias, and safety of our models.}
}
@inproceedings{payoungkhamdee-etal-2024-empirical,
  title     = {An Empirical Study of Multilingual Reasoning Distillation for Question Answering},
  venue     = {EMNLP},
  author    = {Payoungkhamdee, Patomporn and Limkonchotiwat, Peerat and Baek, Jinheon and Manakul, Potsawee and Udomcharoenchaikit, Can and Chuangsuwanich, Ekapol and Nutanong, Sarana},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.442/},
  doi       = {10.18653/v1/2024.emnlp-main.442},
  pages     = {7739--7751},
  abstract  = {Reasoning is one crucial capability in Large Language Models (LLMs), allowing them to perform complex tasks such as solving math problems and multi-step planning. While reasoning capability can emerge in larger models, smaller ones usually have to rely on distillation to transfer this capability from a larger model. However, recent efforts to distill reasoning capabilities have focused mainly on English, leaving multilingual distillation underexplored. To address this gap, this paper examines existing English reasoning distillation methods that utilize a variety of positive rationales in multilingual settings and proposes d-CoT-nR, a novel approach that incorporates incorrect rationales as additional guidance. Empirical results from multilingual high-school examinations show that d-CoT-nR significantly surpasses the baseline, improving accuracy in unseen languages and correctness in step-by-step reasoning.}
}
@inproceedings{phatthiyaphaibun-etal-2024-chie,
    title = "{CHIE}: Generative {MRC} Evaluation for in-context {QA} with Correctness, Helpfulness, Irrelevancy, and Extraneousness Aspects",
    venue = "GenBench @ EMNLP",
    author = "Phatthiyaphaibun, Wannaphong and
      Nonesung, Surapon and
      Limkonchotiwat, Peerat and
      Udomcharoenchaikit, Can and
      Sawatphol, Jitkapat and
      Chuangsuwanich, Ekapol and
      Nutanong, Sarana",
    editor = "Hupkes, Dieuwke and
      Dankers, Verna and
      Batsuren, Khuyagbaatar and
      Kazemnejad, Amirhossein and
      Christodoulopoulos, Christos and
      Giulianelli, Mario and
      Cotterell, Ryan",
    booktitle = "Proceedings of the 2nd GenBench Workshop on Generalisation (Benchmarking) in NLP",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.genbench-1.10/",
    doi = "10.18653/v1/2024.genbench-1.10",
    pages = "154--164",
    abstract = "The evaluation of generative models in Machine Reading Comprehension (MRC) presents distinct difficulties, as traditional metrics like BLEU, ROUGE, METEOR, Exact Match, and F1 score often struggle to capture the nuanced and diverse responses. While embedding-based metrics such as BERTScore and BARTScore focus on semantic similarity, they still fail to fully address aspects such as recognizing additional helpful information and rewarding contextual faithfulness. Recent advances in large language model (LLM) based metrics offer more fine-grained evaluations, but challenges such as score clustering remain. This paper introduces a multi-aspect evaluation framework, CHIE, incorporating aspects of \textbf{C}orrectness, \textbf{H}elpfulness, \textbf{I}rrelevance, and \textbf{E}xtraneousness. Our approach, which uses binary categorical values rather than continuous rating scales, aligns well with human judgments, indicating its potential as a comprehensive and effective evaluation method."
}
@inproceedings{zhang-eickhoff-2024-crocosum,
  title     = {{C}ro{C}o{S}um: A Benchmark Dataset for Cross-Lingual Code-Switched Summarization},
  venue     = {LREC-COLING},
  author    = {Zhang, Ruochen and Eickhoff, Carsten},
  editor    = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.367/},
  pages     = {4113--4126},
  abstract  = {Cross-lingual summarization (CLS) has attracted increasing interest in recent years due to the availability of large-scale web-mined datasets and the advancements of multilingual language models. However, given the rareness of naturally occurring CLS resources, the majority of datasets are forced to rely on translation which can contain overly literal artifacts. This restricts our ability to observe naturally occurring CLS pairs that capture organic diction, including instances of code-switching. This alteration between languages in mid-message is a common phenomenon in multilingual settings yet has been largely overlooked in cross-lingual contexts due to data scarcity. To address this gap, we introduce CroCoSum, a dataset of cross-lingual code-switched summarization of technology news. It consists of over 24,000 English source articles and 18,000 human-written Chinese news summaries, with more than 92{\%} of the summaries containing code-switched phrases. For reference, we evaluate the performance of existing approaches including pipeline, end-to-end, and zero-shot methods. We show that leveraging existing CLS resources as a pretraining step does not improve performance on CroCoSum, indicating the limited generalizability of current datasets. Finally, we discuss the challenges of evaluating cross-lingual summarizers on code-switched generation through qualitative error analyses.}
}
@inproceedings{hudi-etal-2024-disentangling,
  title     = {Disentangling Pretrained Representation to Leverage Low-Resource Languages in Multilingual Machine Translation},
  venue     = {LREC-COLING},
  author    = {Hudi, Frederikus and Qu, Zhi and Kamigaito, Hidetaka and Watanabe, Taro},
  editor    = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.446/},
  pages     = {4978--4989},
  abstract  = {Multilingual neural machine translation aims to encapsulate multiple languages into a single model. However, it requires an enormous dataset, leaving the low-resource language (LRL) underdeveloped. As LRLs may benefit from shared knowledge of multilingual representation, we aspire to find effective ways to integrate unseen languages in a pre-trained model. Nevertheless, the intricacy of shared representation among languages hinders its full utilisation. To resolve this problem, we employed target language prediction and a central language-aware layer to improve representation in integrating LRLs. Focusing on improving LRLs in the linguistically diverse country of Indonesia, we evaluated five languages using a parallel corpus of 1,000 instances each, with experimental results measured by BLEU showing zero-shot improvement of 7.4 from the baseline score of 7.1 to a score of 15.5 at best. Further analysis showed that the gains in performance are attributed more to the disentanglement of multilingual representation in the encoder with the shift of the target language-specific representation in the decoder.}
}
@inproceedings{laosaengpha-etal-2024-learning,
  title     = {Learning Job Title Representation from Job Description Aggregation Network},
  venue     = {ACL Findings},
  author    = {Laosaengpha, Napat and Tativannarat, Thanit and Piansaddhayanon, Chawan and Rutherford, Attapol and Chuangsuwanich, Ekapol},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-acl.77/},
  doi       = {10.18653/v1/2024.findings-acl.77},
  pages     = {1319--1329},
  abstract  = {Learning job title representation is a vital process for developing automatic human resource tools. To do so, existing methods primarily rely on learning the title representation through skills extracted from the job description, neglecting the rich and diverse content within. Thus, we propose an alternative framework for learning job titles through their respective job description (JD) and utilize a Job Description Aggregator component to handle the lengthy description and bidirectional contrastive loss to account for the bidirectional relationship between the job title and its description. We evaluated the performance of our method on both in-domain and out-of-domain settings, achieving a superior performance over the skill-based approach.}
}
@inproceedings{yong-etal-2024-lexc,
  title     = {{L}ex{C}-Gen: Generating Data for Extremely Low-Resource Languages with Large Language Models and Bilingual Lexicons},
  venue     = {EMNLP Findings},
  author    = {Yong, Zheng Xin and Menghini, Cristina and Bach, Stephen},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.818/},
  doi       = {10.18653/v1/2024.findings-emnlp.818},
  pages     = {13990--14009},
  abstract  = {Data scarcity in low-resource languages can be addressed with word-to-word translations from labeled task data in high-resource languages using bilingual lexicons. However, bilingual lexicons often have limited lexical overlap with task data, which results in poor translation coverage and lexicon utilization. We propose lexicon-conditioned data generation LexC-Gen, a method that generates low-resource-language classification task data at scale. Specifically, LexC-Gen first uses high-resource-language words from bilingual lexicons to generate lexicon-compatible task data, and then it translates them into low-resource languages with bilingual lexicons via word translation. Across 17 extremely low-resource languages, LexC-Gen generated data is competitive with expert-translated gold data, and yields on average 5.6 and 8.9 points improvement over existing lexicon-based word translation methods on sentiment analysis and topic classification tasks respectively. Through ablation study, we show that conditioning on bilingual lexicons is the key component of LexC-Gen. LexC-Gen serves as a potential solution to close the performance gap between open-source multilingual models, such as BLOOMZ and Aya-101, and state-of-the-art commercial models like GPT-4o on low-resource-language tasks.}
}
@inproceedings{limkonchotiwat-etal-2024-mccrolin,
  title     = {{M}c{C}rolin: Multi-consistency Cross-lingual Training for Retrieval Question Answering},
  venue     = {EMNLP Findings},
  author    = {Limkonchotiwat, Peerat and Ponwitayarat, Wuttikorn and Lowphansirikul, Lalita and Manakul, Potsawee and Udomcharoenchaikit, Can and Chuangsuwanich, Ekapol and Nutanong, Sarana},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.157/},
  doi       = {10.18653/v1/2024.findings-emnlp.157},
  pages     = {2780--2793},
  abstract  = {Automated question answering (QA) systems are increasingly relying on robust cross-lingual retrieval to identify and utilize information from multilingual sources, ensuring comprehensive and contextually accurate responses. Existing approaches often struggle with consistency across multiple languages and multi-size input scenarios. To address these challenges, we propose McCrolin, a Multi-consistency Cross-lingual training framework, leveraging multi-task learning to enhance cross-lingual consistency, ranking stability, and input-size robustness. Experimental results demonstrate that McCrolin achieves state-of-the-art performance on standard cross-lingual retrieval QA datasets. Furthermore, McCrolin outperforms competitors when dealing with various input sizes on downstream tasks. In terms of generalizability, results from further analysis show that our method is effective for various encoder architectures and sizes.}
}
@inproceedings{pengpun-etal-2024-creating,
  title     = {On Creating an {E}nglish-{T}hai Code-switched Machine Translation in Medical Domain},
  venue     = {EMNLP Findings},
  author    = {Pengpun, Parinthapat and Tiankanon, Krittamate and Chinkamol, Amrest and Kinchagawat, Jiramet and Chairuengjitjaras, Pitchaya and Supholkhan, Pasit and Aussavavirojekul, Pubordee and Boonnag, Chiraphat and Veerakanjana, Kanyakorn and Phimsiri, Hirunkul and Sae-jia, Boonthicha and Sataudom, Nattawach and Ittichaiwong, Piyalitt and Limkonchotiwat, Peerat},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.351/},
  doi       = {10.18653/v1/2024.findings-emnlp.351},
  pages     = {6055--6073},
  abstract  = {Machine translation (MT) in the medical domain plays a pivotal role in enhancing healthcare quality and disseminating medical knowledge. Despite advancements in English-Thai MT technology, common MT approaches often underperform in the medical field due to their inability to precisely translate medical terminologies. Our research prioritizes not merely improving translation accuracy but also maintaining medical terminology in English within the translated text through code-switched (CS) translation. We developed a method to produce CS medical translation data, fine-tuned a CS translation model with this data, and evaluated its performance against strong baselines, such as Google Neural Machine Translation (NMT) and GPT-3.5/GPT-4. Our model demonstrated competitive performance in automatic metrics and was highly favored in human preference evaluations. Our evaluation result also shows that medical professionals significantly prefer CS translations that maintain critical English terms accurately, even if it slightly compromises fluency. Our code and test set are publicly available https://github.com/preceptorai-org/NLLB{\_}CS{\_}EM{\_}NLP2024.}
}
@inproceedings{li-etal-2024-preference,
  title     = {Preference Tuning For Toxicity Mitigation Generalizes Across Languages},
  venue     = {EMNLP Findings},
  author    = {Li, Xiaochen and Yong, Zheng Xin and Bach, Stephen},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.784/},
  doi       = {10.18653/v1/2024.findings-emnlp.784},
  pages     = {13422--13440},
  abstract  = {Detoxifying multilingual Large Language Models (LLMs) has become crucial due to their increasing global use. In this work, we explore zero-shot cross-lingual generalization of preference tuning in detoxifying LLMs. Unlike previous studies that show limited cross-lingual generalization for other safety tasks, we demonstrate that Direct Preference Optimization (DPO) training with only English data can significantly reduce toxicity in multilingual open-ended generations. For example, the probability of mGPT-1.3B generating toxic continuations drops from 46.8{\%} to 3.9{\%} across 17 different languages after training. Our results also extend to other multilingual LLMs, such as BLOOM, Llama3, and Aya-23. Using mechanistic interpretability tools like causal intervention and activation analysis, we identified the dual multilinguality property of MLP layers in LLMs, which explains the cross-lingual generalization of DPO. Finally, we show that bilingual sentence retrieval can predict the cross-lingual transferability of DPO preference tuning.}
}
@inproceedings{forde-etal-2024-evaluating,
  title     = {Re-Evaluating Evaluation for Multilingual Summarization},
  venue     = {EMNLP},
  author    = {Forde, Jessica Zosa and Zhang, Ruochen and Sutawika, Lintang and Aji, Alham Fikri and Cahyawijaya, Samuel and Winata, Genta Indra and Wu, Minghao and Eickhoff, Carsten and Biderman, Stella and Pavlick, Ellie},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.1085/},
  doi       = {10.18653/v1/2024.emnlp-main.1085},
  pages     = {19476--19493},
  abstract  = {Automatic evaluation approaches (ROUGE, BERTScore, LLM-based evaluators) have been widely used to evaluate summarization tasks. Despite the complexities of script differences and tokenization, these approaches have been indiscriminately applied to summarization across multiple languages. While previous works have argued that these approaches correlate strongly with human ratings in English, it remains unclear whether the conclusion holds for other languages. To answer this question, we construct a small-scale pilot dataset containing article-summary pairs and human ratings in English, Chinese and Indonesian. To measure the strength of summaries, our ratings are measured as head-to-head comparisons with resulting Elo scores across four dimensions. Our analysis reveals that standard metrics are unreliable measures of quality, and that these problems are exacerbated in Chinese and Indonesian. We advocate for more nuanced and careful considerations in designing a robust evaluation framework for multiple languages.}
}
@inproceedings{urailertprasert-etal-2024-sea,
title = "{SEA}-{VQA}: {S}outheast {A}sian Cultural Context Dataset For Visual Question Answering",
venue = "ALVR @ ACL",
author = "Urailertprasert, Norawit and
Limkonchotiwat, Peerat and
Suwajanakorn, Supasorn and
Nutanong, Sarana",
editor = "Gu, Jing and
Fu, Tsu-Jui (Ray) and
Hudson, Drew and
Celikyilmaz, Asli and
Wang, William",
booktitle = "Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.alvr-1.15/",
doi = "10.18653/v1/2024.alvr-1.15",
pages = "173--185",
abstract = "Visual Question Answering (VQA) is a critical task that requires the simultaneous understanding of visual and textual information. While significant advancements have been made with multilingual datasets, these often lack cultural specificity, especially in the context of Southeast Asia (SEA). In this paper, we introduce SEA-VQA aiming to highlight the challenges and gaps in existing VQA models when confronted with culturally specific content. Our dataset includes images from eight SEA countries, curated from the UNESCO Cultural Heritage collection. Our evaluation, comparing GPT-4 and GEMINI models, demonstrates substantial performance drops on culture-centric questions compared to the A-OKVQA dataset, a commonsense and world-knowledge VQA benchmark comprising approximately 25,000 questions. Our findings underscore the importance of cultural diversity in VQA datasets and reveal substantial gaps in the ability of current VQA models to handle culturally rich contexts. SEA-VQA serves as a crucial benchmark for identifying these gaps and guiding future improvements in VQA systems."
}
@inproceedings{adilazuarda-etal-2022-indorobusta,
title = "{I}ndo{R}obusta: Towards Robustness Against Diverse Code-Mixed {I}ndonesian Local Languages",
venue = "SUMEval @ ACL",
author = "Adilazuarda, Muhammad Farid and
Cahyawijaya, Samuel and
Winata, Genta Indra and
Fung, Pascale and
Purwarianti, Ayu",
editor = "Ahuja, Kabir and
Anastasopoulos, Antonios and
Patra, Barun and
Neubig, Graham and
Choudhury, Monojit and
Dandapat, Sandipan and
Sitaram, Sunayana and
Chaudhary, Vishrav",
booktitle = "Proceedings of the First Workshop on Scaling Up Multilingual Evaluation",
month = nov,
year = "2022",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.sumeval-1.5/",
doi = "10.18653/v1/2022.sumeval-1.5",
pages = "25--34",
abstract = "Significant progress has been made on Indonesian NLP. Nevertheless, exploration of the code-mixing phenomenon in Indonesian is limited, despite many languages being frequently mixed with Indonesian in daily conversation. In this work, we explore code-mixing in Indonesian with four embedded languages, i.e., English, Sundanese, Javanese, and Malay; and introduce IndoRobusta, a framework to evaluate and improve the code-mixing robustness. Our analysis shows that the pre-training corpus bias affects the model{'}s ability to better handle Indonesian-English code-mixing when compared to other local languages, despite having higher language diversity."
}
@inproceedings{koto-etal-2023-large,
title = "Large Language Models Only Pass Primary School Exams in {I}ndonesia: A Comprehensive Test on {I}ndo{MMLU}",
venue = "EMNLP",
author = "Koto, Fajri and
Aisyah, Nurul and
Li, Haonan and
Baldwin, Timothy",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.760/",
doi = "10.18653/v1/2023.emnlp-main.760",
pages = "12359--12374",
abstract = "Although large language models (LLMs) are often pre-trained on large-scale multilingual texts, their reasoning abilities and real-world knowledge are mainly evaluated based on English datasets. Assessing LLM capabilities beyond English is increasingly vital but hindered due to the lack of suitable datasets. In this work, we introduce IndoMMLU, the first multi-task language understanding benchmark for Indonesian culture and languages, which consists of questions from primary school to university entrance exams in Indonesia. By employing professional teachers, we obtain 14,981 questions across 64 tasks and education levels, with 46{\%} of the questions focusing on assessing proficiency in the Indonesian language and knowledge of nine local languages and cultures in Indonesia. Our empirical evaluations show that GPT-3.5 only manages to pass the Indonesian primary school level, with limited knowledge of local Indonesian languages and culture. Other smaller models such as BLOOMZ and Falcon perform at even lower levels."
}
@inproceedings{zhang-etal-2023-multilingual,
title = "Multilingual Large Language Models Are Not (Yet) Code-Switchers",
venue = "EMNLP",
author = "Zhang, Ruochen and
Cahyawijaya, Samuel and
Cruz, Jan Christian Blaise and
Winata, Genta and
Aji, Alham Fikri",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.774/",
doi = "10.18653/v1/2023.emnlp-main.774",
pages = "12567--12582",
abstract = "Multilingual Large Language Models (LLMs) have recently shown great capabilities in a wide range of tasks, exhibiting state-of-the-art performance through zero-shot or few-shot prompting methods. While there have been extensive studies on their abilities in monolingual tasks, the investigation of their potential in the context of code-switching (CSW), the practice of alternating languages within an utterance, remains relatively uncharted. In this paper, we provide a comprehensive empirical analysis of various multilingual LLMs, benchmarking their performance across four tasks: sentiment analysis, machine translation, summarization and word-level language identification. Our results indicate that despite multilingual LLMs exhibiting promising outcomes in certain tasks using zero or few-shot prompting, they still underperform in comparison to fine-tuned models of much smaller scales. We argue that current ``multilingualism'' in LLMs does not inherently imply proficiency with code-switching texts, calling for future research to bridge this discrepancy."
}
@inproceedings{cahyawijaya-etal-2023-nusacrowd,
title = "{N}usa{C}rowd: Open Source Initiative for {I}ndonesian {NLP} Resources",
venue = "ACL Findings",
author = "Cahyawijaya, Samuel and
Lovenia, Holy and
Aji, Alham Fikri and
Winata, Genta and
Wilie, Bryan and
Koto, Fajri and
Mahendra, Rahmad and
Wibisono, Christian and
Romadhony, Ade and
Vincentio, Karissa and
Santoso, Jennifer and
Moeljadi, David and
Wirawan, Cahya and
Hudi, Frederikus and
Wicaksono, Muhammad Satrio and
Parmonangan, Ivan and
Alfina, Ika and
Putra, Ilham Firdausi and
Rahmadani, Samsul and
Oenang, Yulianti and
Septiandri, Ali and
Jaya, James and
Dhole, Kaustubh and
Suryani, Arie and
Putri, Rifki Afina and
Su, Dan and
Stevens, Keith and
Nityasya, Made Nindyatama and
Adilazuarda, Muhammad and
Hadiwijaya, Ryan and
Diandaru, Ryandito and
Yu, Tiezheng and
Ghifari, Vito and
Dai, Wenliang and
Xu, Yan and
Damapuspita, Dyah and
Wibowo, Haryo and
Tho, Cuk and
Karo Karo, Ichwanul and
Fatyanosa, Tirana and
Ji, Ziwei and
Neubig, Graham and
Baldwin, Timothy and
Ruder, Sebastian and
Fung, Pascale and
Sujaini, Herry and
Sakti, Sakriani and
Purwarianti, Ayu",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.868/",
doi = "10.18653/v1/2023.findings-acl.868",
pages = "13745--13818",
abstract = "We present NusaCrowd, a collaborative initiative to collect and unify existing resources for Indonesian languages, including opening access to previously non-public resources. Through this initiative, we have brought together 137 datasets and 118 standardized data loaders. The quality of the datasets has been assessed manually and automatically, and their value is demonstrated through multiple experiments. NusaCrowd{'}s data collection enables the creation of the first zero-shot benchmarks for natural language understanding and generation in Indonesian and the local languages of Indonesia. Furthermore, NusaCrowd brings the creation of the first multilingual automatic speech recognition benchmark in Indonesian and the local languages of Indonesia. Our work strives to advance natural language processing (NLP) research for languages that are under-represented despite being widely spoken."
}
@inproceedings{yong-etal-2023-prompting,
title = "Prompting Multilingual Large Language Models to Generate Code-Mixed Texts: The Case of South {E}ast {A}sian Languages",
venue = "CALCS-ACL",
author = "Yong, Zheng Xin and
Zhang, Ruochen and
Forde, Jessica and
Wang, Skyler and
Subramonian, Arjun and
Lovenia, Holy and
Cahyawijaya, Samuel and
Winata, Genta and
Sutawika, Lintang and
Cruz, Jan Christian Blaise and
Tan, Yin Lin and
Phan, Long and
Garcia, Rowena and
Solorio, Thamar and
Aji, Alham Fikri",
editor = "Winata, Genta and
Kar, Sudipta and
Zhukova, Marina and
Solorio, Thamar and
Diab, Mona and
Sitaram, Sunayana and
Choudhury, Monojit and
Bali, Kalika",
booktitle = "Proceedings of the 6th Workshop on Computational Approaches to Linguistic Code-Switching",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.calcs-1.5/",
pages = "43--63",
abstract = "While code-mixing is a common linguistic practice in many parts of the world, collecting high-quality and low-cost code-mixed data remains a challenge for natural language processing (NLP) research. The recent proliferation of Large Language Models (LLMs) compels one to ask: how capable are these systems in generating code-mixed data? In this paper, we explore prompting multilingual LLMs in a zero-shot manner to generate code-mixed data for seven languages in South East Asia (SEA), namely Indonesian, Malay, Chinese, Tagalog, Vietnamese, Tamil, and Singlish. We find that publicly available multilingual instruction-tuned models such as BLOOMZ and Flan-T5-XXL are incapable of producing texts with phrases or clauses from different languages. ChatGPT exhibits inconsistent capabilities in generating code-mixed texts, wherein its performance varies depending on the prompt template and language pairing. For instance, ChatGPT generates fluent and natural Singlish texts (an English-based creole spoken in Singapore), but for English-Tamil language pair, the system mostly produces grammatically incorrect or semantically meaningless utterances. Furthermore, it may erroneously introduce languages not specified in the prompt. Based on our investigation, existing multilingual LLMs exhibit a wide range of proficiency in code-mixed data generation for SEA languages. As such, we advise against using LLMs in this context without extensive human checks."
}
@inproceedings{velasco-etal-2023-towards,
title = "Towards Automatic Construction of {F}ilipino {W}ord{N}et: Word Sense Induction and Synset Induction Using Sentence Embeddings",
venue = "SEALP @ ACL",
author = "Velasco, Dan John and
Alba, Axel and
Pelagio, Trisha Gail and
Ramirez, Bryce Anthony and
Cruz, Jan Christian Blaise and
Chua, Unisse and
Samson, Briane Paul and
Cheng, Charibeth",
editor = "Wijaya, Derry and
Aji, Alham Fikri and
Vania, Clara and
Winata, Genta Indra and
Purwarianti, Ayu",
booktitle = "Proceedings of the First Workshop in South East Asian Language Processing",
month = nov,
year = "2023",
address = "Nusa Dua, Bali, Indonesia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.sealp-1.1/",
doi = "10.18653/v1/2023.sealp-1.1",
pages = "1--12"
}
@inproceedings{cahyawijaya-etal-2023-nusawrites,
title = "{N}usa{W}rites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages",
venue = "IJCNLP @ AACL",
author = "Cahyawijaya, Samuel and
Lovenia, Holy and
Koto, Fajri and
Adhista, Dea and
Dave, Emmanuel and
Oktavianti, Sarah and
Akbar, Salsabil and
Lee, Jhonson and
Shadieq, Nuur and
Cenggoro, Tjeng Wawan and
Linuwih, Hanung and
Wilie, Bryan and
Muridan, Galih and
Winata, Genta and
Moeljadi, David and
Aji, Alham Fikri and
Purwarianti, Ayu and
Fung, Pascale",
editor = "Park, Jong C. and
Arase, Yuki and
Hu, Baotian and
Lu, Wei and
Wijaya, Derry and
Purwarianti, Ayu and
Krisnadhi, Adila Alfa",
booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = nov,
year = "2023",
address = "Nusa Dua, Bali",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.ijcnlp-main.60/",
doi = "10.18653/v1/2023.ijcnlp-main.60",
pages = "921--945"
}
@inproceedings{kautsar-etal-2023-indotod,
title = "{I}ndo{T}o{D}: A Multi-Domain {I}ndonesian Benchmark For End-to-End Task-Oriented Dialogue Systems",
venue = "SEALP @ AACL",
author = "Kautsar, Muhammad and
Nurdini, Rahmah and
Cahyawijaya, Samuel and
Winata, Genta and
Purwarianti, Ayu",
editor = "Wijaya, Derry and
Aji, Alham Fikri and
Vania, Clara and
Winata, Genta Indra and
Purwarianti, Ayu",
booktitle = "Proceedings of the First Workshop in South East Asian Language Processing",
month = nov,
year = "2023",
address = "Nusa Dua, Bali, Indonesia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.sealp-1.7/",
doi = "10.18653/v1/2023.sealp-1.7",
pages = "85--99"
}
@article{kotoIndoCultureExploringGeographically2024,
title = {{{IndoCulture}}: {{Exploring Geographically Influenced Cultural Commonsense Reasoning Across Eleven Indonesian Provinces}}},
venue = {TACL},
shorttitle = {{{IndoCulture}}},
author = {Koto, Fajri and Mahendra, Rahmad and Aisyah, Nurul and Baldwin, Timothy},
year = {2024},
month = dec,
journal = {Transactions of the Association for Computational Linguistics},
volume = {12},
pages = {1703--1719},
issn = {2307-387X},
doi = {10.1162/tacl_a_00726},
url = {https://doi.org/10.1162/tacl\_a\_00726},
urldate = {2025-09-12}
}
@misc{liBactrianXMultilingualReplicable2023,
title = {Bactrian-{{X}}: {{Multilingual Replicable Instruction-Following Models}} with {{Low-Rank Adaptation}}},
shorttitle = {Bactrian-{{X}}},
author = {Li, Haonan and Koto, Fajri and Wu, Minghao and Aji, Alham Fikri and Baldwin, Timothy},
year = {2023},
month = oct,
number = {arXiv:2305.15011},
eprint = {2305.15011},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.15011},
url = {http://arxiv.org/abs/2305.15011},
urldate = {2025-09-12},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language}
}
@inproceedings{pengpunSeedFreeSyntheticData2024,
title = {Seed-{{Free Synthetic Data Generation Framework}} for {{Instruction-Tuning LLMs}}: {{A Case Study}} in {{Thai}}},
shorttitle = {Seed-{{Free Synthetic Data Generation Framework}} for {{Instruction-Tuning LLMs}}},
venue = {Student Research Workshop @ ACL},
booktitle = {Proceedings of the 62nd {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}} ({{Volume}} 4: {{Student Research Workshop}})},
author = {Pengpun, Parinthapat and Udomcharoenchaikit, Can and Buaphet, Weerayut and Limkonchotiwat, Peerat},
year = {2024},
pages = {438--457},
publisher = {Association for Computational Linguistics},
address = {Bangkok, Thailand},
doi = {10.18653/v1/2024.acl-srw.38},
url = {https://aclanthology.org/2024.acl-srw.38},
urldate = {2025-09-12},
langid = {english}
}
@inproceedings{romeroCVQACulturallydiverseMultilingual2025,
title = {{{CVQA}}: Culturally-Diverse Multilingual Visual Question Answering Benchmark},
shorttitle = {{{CVQA}}},
booktitle = {Proceedings of the 38th {{International Conference}} on {{Neural Information Processing Systems}}},
author = {Romero, David and Lyu, Chenyang and Wibowo, Haryo Akbarianto and Lynn, Teresa and Hamed, Injy and Kishore, Aditya Nanda and Mandal, Aishik and Dragonetti, Alina and Abzaliev, Artem and Tonja, Atnafu Lambebo and Balcha, Bontu Fufa and Whitehouse, Chenxi and Salamea, Christian and Velasco, Dan John and Adelani, David Ifeoluwa and Le Meur, David and {Villa-Cueva}, Emilio and Koto, Fajri and Farooqui, Fauzan and Belcavello, Frederico and Batnasan, Ganzorig and Vallejo, Gisela and Caulfield, Grainne and Ivetta, Guido and Song, Haiyue and Ademtew, Henok Biadglign and Maina, Hern{\'a}n and Lovenia, Holy and Azime, Israel Abebe and Cruz, Jan Christian Blaise and Gala, Jay and Geng, Jiahui and {Ortiz-Barajas}, Jesus-German and Baek, Jinheon and Dunstan, Jocelyn and Alemany, Laura Alonso and Nagasinghe, Kumaranage Ravindu Yasas and Benotti, Luciana and D'Haro, Luis Fernando and Viridiano, Marcelo and {Estecha-Garitagoitia}, Marcos and Cabrera, Maria Camila Buitrago and {Rodr{\'i}guez-Cantelar}, Mario and Jouitteau, M{\'e}lanie and Mihaylov, Mihail and Etori, Naome and Imam, Mohamed Fazli Mohamed and Adilazuarda, Muhammad Farid and Gochoo, Munkhjargal and Otgonbold, Munkh-Erdene and Niyomugisha, Olivier and Silva, Paula M{\'o}nica and Chitale, Pranjal and Dabre, Raj and Chevi, Rendi and Zhang, Ruochen and Diandaru, Ryandito and Cahyawijaya, Samuel and G{\'o}ngora, Santiago and Jeong, Soyeong and Purkayastha, Sukannya and Kuribayashi, Tatsuki and Clifford, Teresa and Jayakumar, Thanmay and Torrent, Tiago Timponi and Ehsan, Toqeer and Araujo, Vladimir and Kementchedjhieva, Yova and Burzo, Zara and Lim, Zheng Wei and Yong, Zheng Xin and Ignat, Oana and Nwatu, Joan and Mihalcea, Rada and Solorio, Thamar and Aji, Alham Fikri},
year = {2025},
month = jun,
series = {{{NeurIPS}} '24},
volume = {37},
pages = {11479--11505},
doi = {10.5555/3737916.3738282},
url = {https://dl.acm.org/doi/10.5555/3737916.3738282},
publisher = {Curran Associates Inc.},
address = {Red Hook, NY, USA},
web = {https://cvqa-benchmark.org/},
resources = {https://huggingface.co/datasets/afaji/cvqa},
abstract = {Visual Question Answering (VQA) is an important task in multimodal AI, and it is often used to test the ability of vision-language models to understand and reason on knowledge present in both visual and textual data. However, most of the current VQA models use datasets that are primarily focused on English and a few major world languages, with images that are typically Western-centric. While recent efforts have tried to increase the number of languages covered on VQA datasets, they still lack diversity in low-resource languages. More importantly, although these datasets often extend their linguistic range via translation or some other approaches, they usually keep images the same, resulting in narrow cultural representation. To address these limitations, we construct CVQA, a new Culturally-diverse multilingual Visual Question Answering benchmark, designed to cover a rich set of languages and cultures, where we engage native speakers and cultural experts in the data collection process. As a result, CVQA includes culturally-driven images and questions from across 30 countries on four continents, covering 31 languages with 13 scripts, providing a total of 10k questions. We then benchmark several Multimodal Large Language Models (MLLMs) on CVQA, and show that the dataset is challenging for the current state-of-the-art models. This benchmark can serve as a probing evaluation suite for assessing the cultural capability and bias of multimodal models and hopefully encourage more research efforts toward increasing cultural awareness and linguistic diversity in this field.}
}
@inproceedings{yongLowResourceLanguagesJailbreak2023,
title = {Low-{{Resource Languages Jailbreak GPT-4}}},
award = {Best Paper Award},
venue = {SoLaR @ NeurIPS},
booktitle = {Socially {{Responsible Language Modelling Research}} ({{SoLaR}}) {{Workshop}} at {{Thirty-Seventh Annual Conference}} on {{Neural Information Processing Systems}}},
author = {Yong, Zheng Xin and Menghini, Cristina and Bach, Stephen},
year = {2023},
month = nov,
address = {New Orleans, USA},
doi = {10.48550/arXiv.2310.02446},
url = {https://arxiv.org/abs/2310.02446},
pdf = {https://arxiv.org/pdf/2310.02446}
}
@inproceedings{winata-etal-2023-nusax,
title = "{N}usa{X}: Multilingual Parallel Sentiment Dataset for 10 {I}ndonesian Local Languages",
award = "EACL Outstanding Paper",
author = "Winata, Genta Indra and
Aji, Alham Fikri and
Cahyawijaya, Samuel and
Mahendra, Rahmad and
Koto, Fajri and
Romadhony, Ade and
Kurniawan, Kemal and
Moeljadi, David and
Prasojo, Radityo Eko and
Fung, Pascale and
Baldwin, Timothy and
Lau, Jey Han and
Sennrich, Rico and
Ruder, Sebastian",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
venue = "EACL",
url = "https://aclanthology.org/2023.eacl-main.57/",
doi = "10.18653/v1/2023.eacl-main.57",
pages = "815--834",
abstract = "Natural language processing (NLP) has a significant impact on society via technologies such as machine translation and search engines. Despite its success, NLP technology is only widely available for high-resource languages such as English and Chinese, while it remains inaccessible to many languages due to the unavailability of data resources and benchmarks. In this work, we focus on developing resources for languages in Indonesia. Despite being the second most linguistically diverse country, most languages in Indonesia are categorized as endangered and some are even extinct. We develop the first-ever parallel resource for 10 low-resource languages in Indonesia. Our resource includes sentiment and machine translation datasets, and bilingual lexicons. We provide extensive analyses and describe challenges for creating such resources. We hope this work can spark NLP research on Indonesian and other underrepresented languages."
}
@inproceedings{imperial-kochmar-2023-automatic,
title = "Automatic Readability Assessment for Closely Related Languages",
venue = "ACL Findings",
author = "Imperial, Joseph Marvin and
Kochmar, Ekaterina",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.331/",
doi = "10.18653/v1/2023.findings-acl.331",
pages = "5371--5386",
abstract = "In recent years, the main focus of research on automatic readability assessment (ARA) has shifted towards using expensive deep learning-based methods with the primary goal of increasing models' accuracy. This, however, is rarely applicable for low-resource languages where traditional handcrafted features are still widely used due to the lack of existing NLP tools to extract deeper linguistic representations. In this work, we take a step back from the technical component and focus on how linguistic aspects such as mutual intelligibility or degree of language relatedness can improve ARA in a low-resource setting. We collect short stories written in three languages in the Philippines{---}Tagalog, Bikol, and Cebuano{---}to train readability assessment models and explore the interaction of data and features in various cross-lingual setups. Our results show that the inclusion of CrossNGO, a novel specialized feature exploiting n-gram overlap applied to languages with high mutual intelligibility, significantly improves the performance of ARA models compared to the use of off-the-shelf large multilingual language models alone. Consequently, when both linguistic representations are combined, we achieve state-of-the-art results for Tagalog and Cebuano, and baseline scores for ARA in Bikol."
}
@inproceedings{imperial-kochmar-2023-basahacorpus,
title = "{B}asaha{C}orpus: An Expanded Linguistic Resource for Readability Assessment in {C}entral {P}hilippine Languages",
venue = "EMNLP",
author = "Imperial, Joseph Marvin and
Kochmar, Ekaterina",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.388/",
doi = "10.18653/v1/2023.emnlp-main.388",
pages = "6302--6309",
abstract = "Current research on automatic readability assessment (ARA) has focused on improving the performance of models in high-resource languages such as English. In this work, we introduce and release BasahaCorpus as part of an initiative aimed at expanding available corpora and baseline models for readability assessment in lower resource languages in the Philippines. We compiled a corpus of short fictional narratives written in Hiligaynon, Minasbate, Karay-a, and Rinconada{---}languages belonging to the Central Philippine family tree subgroup{---}to train ARA models using surface-level, syllable-pattern, and n-gram overlap features. We also propose a new hierarchical cross-lingual modeling approach that takes advantage of a language{'}s placement in the family tree to increase the amount of available training data. Our study yields encouraging results that support previous work showcasing the efficacy of cross-lingual models in low-resource settings, as well as similarities in highly informative linguistic features for mutually intelligible languages."
}
@inproceedings{yong-etal-2023-bloom,
title = "{BLOOM}+1: Adding Language Support to {BLOOM} for Zero-Shot Prompting",
venue = "ACL",
author = "Yong, Zheng Xin and
Schoelkopf, Hailey and
Muennighoff, Niklas and
Aji, Alham Fikri and
Adelani, David Ifeoluwa and
Almubarak, Khalid and
Bari, M Saiful and
Sutawika, Lintang and
Kasai, Jungo and
Baruwa, Ahmed and
Winata, Genta and
Biderman, Stella and
Raff, Edward and
Radev, Dragomir and
Nikoulina, Vassilina",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.653/",
doi = "10.18653/v1/2023.acl-long.653",
pages = "11682--11703",
abstract = "The BLOOM model is a large publicly available multilingual language model, but its pretraining was limited to 46 languages. To extend the benefits of BLOOM to other languages without incurring prohibitively large costs, it is desirable to adapt BLOOM to new languages not seen during pretraining. In this work, we apply existing language adaptation strategies to BLOOM and benchmark its zero-shot prompting performance on eight new languages in a resource-constrained setting. We find language adaptation to be effective at improving zero-shot performance in new languages. Surprisingly, we find that adapter-based finetuning is more effective than continued pretraining for large models. In addition, we discover that prompting performance is not significantly affected by language specifics, such as the writing system. It is primarily determined by the size of the language adaptation data. We also add new languages to BLOOMZ, which is a multitask finetuned version of BLOOM capable of following task instructions zero-shot. We find including a new language in the multitask fine-tuning mixture to be the most effective method to teach BLOOMZ a new language. We conclude that with sufficient training data language adaptation can generalize well to diverse languages. Our code is available at \url{https://github.com/bigscience-workshop/multilingual-modeling}."
}
@inproceedings{pilar-etal-2023-cebuaner,
title = "{C}ebua{NER}: A New Baseline {C}ebuano Named Entity Recognition Model",
venue = "PACLIC @ ACL",
author = "Pilar, Ma. Beatrice Emanuela and
Dedoroy, Dane and
Papas, Ellyza Mari and
Buenaventura, Mary Loise and
Montefalcon, Myron Darrel and
Padilla, Jay Rhald and
Imperial, Joseph Marvin and
Abisado, Mideth and
Maceda, Lany",
editor = "Huang, Chu-Ren and
Harada, Yasunari and
Kim, Jong-Bok and
Chen, Si and
Hsu, Yu-Yin and
Chersoni, Emmanuele and
A, Pranav and
Zeng, Winnie Huiheng and
Peng, Bo and
Li, Yuxi and
Li, Junlin",
booktitle = "Proceedings of the 37th Pacific Asia Conference on Language, Information and Computation",
month = dec,
year = "2023",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.paclic-1.79/",
pdf = "https://aclanthology.org/2023.paclic-1.79.pdf",
pages = "792--800",
abstract = "Despite being one of the most linguistically diverse groups of countries, computational linguistics and language processing research in Southeast Asia has struggled to match the level of countries from the Global North. Thus, initiatives such as open-sourcing corpora and the development of baseline models for basic language processing tasks are important stepping stones to encourage the growth of research efforts in the field. To answer this call, we introduce CEBUANER, a new baseline model for named entity recognition (NER) in the Cebuano language. Cebuano is the second most-used native language in the Philippines with over 20 million speakers. To build the model, we collected and annotated over 4,000 news articles, the largest of any work in the language, retrieved from online local Cebuano platforms to train algorithms such as Conditional Random Field and Bidirectional LSTM. Our findings show promising results as a new baseline model, achieving over 70{\%} performance on precision, recall, and F1 across all entity tags as well as potential efficacy in a crosslingual setup with Tagalog."
}
@inproceedings{pengpun-etal-2023-cross,
title = "Cross-Lingual Data Augmentation For {T}hai Question-Answering",
venue = "GenBench @ EMNLP",
author = "Pengpun, Parinthapat and
Udomcharoenchaikit, Can and
Buaphet, Weerayut and
Limkonchotiwat, Peerat",
editor = "Hupkes, Dieuwke and
Dankers, Verna and
Batsuren, Khuyagbaatar and
Sinha, Koustuv and
Kazemnejad, Amirhossein and
Christodoulopoulos, Christos and
Cotterell, Ryan and
Bruni, Elia",
booktitle = "Proceedings of the 1st GenBench Workshop on (Benchmarking) Generalisation in NLP",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.genbench-1.16/",
doi = "10.18653/v1/2023.genbench-1.16",
pages = "193--203",
abstract = "This paper presents an innovative data augmentation framework with data quality control designed to enhance the robustness of Question Answering (QA) models in low-resource languages, particularly Thai. Recognizing the challenges posed by the scarcity and quality of training data, we leverage data augmentation techniques in both monolingual and cross-lingual settings. Our approach augments and enriches the original dataset, thereby increasing its linguistic diversity and robustness. We evaluate the robustness of our framework on Machine Reading Comprehension, and the experimental results illustrate the potential of data augmentation to effectively increase training data and improve model generalization in low-resource language settings, offering a promising direction for the data augmentation manner."
}
@inproceedings{muennighoff-etal-2023-crosslingual,
title = "Crosslingual Generalization through Multitask Finetuning",
author = "Muennighoff, Niklas and
Wang, Thomas and
Sutawika, Lintang and
Roberts, Adam and
Biderman, Stella and
Le Scao, Teven and
Bari, M Saiful and
Shen, Sheng and
Yong, Zheng Xin and
Schoelkopf, Hailey and
Tang, Xiangru and
Radev, Dragomir and
Aji, Alham Fikri and
Almubarak, Khalid and
Albanie, Samuel and
Alyafeai, Zaid and
Webson, Albert and
Raff, Edward and
Raffel, Colin",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
venue = "ACL",
url = "https://aclanthology.org/2023.acl-long.891/",
doi = "10.18653/v1/2023.acl-long.891",
pages = "15991--16111",
abstract = "Multitask prompted finetuning (MTF) has been shown to help large language models generalize to new tasks in a zero-shot setting, but so far explorations of MTF have focused on English data and models. We apply MTF to the pretrained multilingual BLOOM and mT5 model families to produce finetuned variants called BLOOMZ and mT0. We find finetuning large multilingual language models on English tasks with English prompts allows for task generalization to non-English languages that appear only in the pretraining corpus. Finetuning on multilingual tasks with English prompts further improves performance on English and non-English tasks leading to various state-of-the-art zero-shot results. We also investigate finetuning on multilingual tasks with prompts that have been machine-translated from English to match the language of each dataset. We find training on these machine-translated prompts leads to better performance on human-written prompts in the respective languages. Surprisingly, we find models are capable of zero-shot generalization to tasks in languages they have never intentionally seen. We conjecture that the models are learning higher-level capabilities that are both task- and language-agnostic. In addition, we introduce xP3, a composite of supervised datasets in 46 languages with English and machine-translated prompts. Our code, datasets and models are freely available at \url{https://github.com/bigscience-workshop/xmtf}."
}
@inproceedings{miranda-2023-developing,
title = "Developing a Named Entity Recognition Dataset for {T}agalog",
venue = "SEALP @ ACL",
author = "Miranda, Lester James V.",
editor = "Wijaya, Derry and
Aji, Alham Fikri and
Vania, Clara and
Winata, Genta Indra and
Purwarianti, Ayu",
booktitle = "Proceedings of the First Workshop in South East Asian Language Processing",