-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaffiliated.bib
More file actions
1096 lines (1055 loc) · 71.7 KB
/
affiliated.bib
File metadata and controls
1096 lines (1055 loc) · 71.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@inproceedings{liu-etal-2024-multilingual,
    title = "Are Multilingual {LLM}s Culturally-Diverse Reasoners? An Investigation into Multicultural Proverbs and Sayings",
    venue = "NAACL",
    author = "Liu, Chen Cecilia and
      Koto, Fajri and
      Baldwin, Timothy and
      Gurevych, Iryna",
    editor = "Duh, Kevin and
      Gomez, Helena and
      Bethard, Steven",
    booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    month = jun,
    year = "2024",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.naacl-long.112/",
    doi = "10.18653/v1/2024.naacl-long.112",
    pages = "2016--2039",
    abstract = "Large language models (LLMs) are highly adept at question answering and reasoning tasks, but when reasoning in a situational context, human expectations vary depending on the relevant cultural common ground. As languages are associated with diverse cultures, LLMs should also be culturally-diverse reasoners. In this paper, we study the ability of a wide range of state-of-the-art multilingual LLMs (mLLMs) to reason with proverbs and sayings in a conversational context. Our experiments reveal that: (1) mLLMs ``know'' limited proverbs and memorizing proverbs does not mean understanding them within a conversational context; (2) mLLMs struggle to reason with figurative proverbs and sayings, and when asked to select the wrong answer (instead of asking it to select the correct answer); and (3) there is a ``culture gap'' in mLLMs when reasoning about proverbs and sayings translated from other languages. We construct and release our evaluation dataset MAPS (MulticulturAl Proverbs and Sayings) for proverb understanding with conversational context for six different languages."
}
@inproceedings{cahyawijaya-etal-2024-cendol,
    title = "Cendol: Open Instruction-tuned Generative Large Language Models for {I}ndonesian Languages",
    venue = "ACL",
    author = "Cahyawijaya, Samuel and
      Lovenia, Holy and
      Koto, Fajri and
      Putri, Rifki and
      Cenggoro, Wawan and
      Lee, Jhonson and
      Akbar, Salsabil and
      Dave, Emmanuel and
      Nuurshadieq, Nuurshadieq and
      Mahendra, Muhammad and
      Putri, Rr and
      Wilie, Bryan and
      Winata, Genta Indra and
      Aji, Alham Fikri and
      Purwarianti, Ayu and
      Fung, Pascale",
    editor = "Ku, Lun-Wei and
      Martins, Andre and
      Srikumar, Vivek",
    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.acl-long.796/",
    doi = "10.18653/v1/2024.acl-long.796",
    pages = "14899--14914",
    abstract = "Large language models (LLMs) show remarkable human-like capability in various domains and languages. To bridge this quality gap, we introduce Cendol, a collection of Indonesian LLMs encompassing both decoder-only and encoder-decoder architectures across a range of model sizes. We highlight Cendol{'}s effectiveness across a diverse array of tasks, attaining {\textasciitilde}20{\%} improvement, and demonstrate its capability to generalize to unseen tasks and indigenous languages of Indonesia. Furthermore, Cendol models showcase improved human favorability despite their limitations in capturing indigenous knowledge and cultural values in Indonesia. In addition, we discuss the shortcomings of parameter-efficient tunings, such as LoRA, for language adaptation. Alternatively, we propose the usage of vocabulary adaptation to enhance efficiency. Lastly, we evaluate the safety of Cendol and showcase that safety in pre-training in one language such as English is transferable to low-resource languages, such as Indonesian, even without RLHF and safety fine-tuning."
}
@inproceedings{wibowo-etal-2024-copal,
    title = "{COPAL}-{ID}: {I}ndonesian Language Reasoning with Local Culture and Nuances",
    venue = "ACL",
    author = "Wibowo, Haryo and
      Fuadi, Erland and
      Nityasya, Made and
      Prasojo, Radityo Eko and
      Aji, Alham Fikri",
    editor = "Duh, Kevin and
      Gomez, Helena and
      Bethard, Steven",
    booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    month = jun,
    year = "2024",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.naacl-long.77/",
    doi = "10.18653/v1/2024.naacl-long.77",
    pages = "1404--1422",
    abstract = "We present COPAL-ID, a novel, public Indonesian language common sense reasoning dataset. Unlike the previous Indonesian COPA dataset (XCOPA-ID), COPAL-ID incorporates Indonesian local and cultural nuances, and therefore, provides a more natural portrayal of day-to-day causal reasoning within the Indonesian cultural sphere. Professionally written by natives from scratch, COPAL-ID is more fluent and free from awkward phrases, unlike the translated XCOPA-ID. In addition, we present COPAL-ID in both standard Indonesian and in Jakartan Indonesian{--}a dialect commonly used in daily conversation. COPAL-ID poses a greater challenge for existing open-sourced and closed state-of-the-art multilingual language models, yet is trivially easy for humans. Our findings suggest that general multilingual models struggle to perform well, achieving 66.91{\%} accuracy on COPAL-ID. South-East Asian-specific models achieve slightly better performance of 73.88{\%} accuracy. Yet, this number still falls short of near-perfect human performance. This shows that these language models are still way behind in comprehending the local nuances of Indonesian."
}
@inproceedings{koto-2025-cracking,
  title     = {Cracking the Code: Multi-domain {LLM} Evaluation on Real-World Professional Exams in {I}ndonesia},
  venue     = {NAACL},
  author    = {Koto, Fajri},
  editor    = {Chen, Weizhu and Yang, Yi and Kachuee, Mohammad and Fu, Xue-Yong},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-industry.69/},
  doi       = {10.18653/v1/2025.naacl-industry.69},
  pages     = {938--948},
  isbn      = {979-8-89176-194-0},
  abstract  = {While knowledge evaluation in large language models has predominantly focused on academic subjects like math and physics, these assessments often fail to capture the practical demands of real-world professions. In this paper, we introduce IndoCareer, a dataset comprising 8,834 multiple-choice questions designed to evaluate performance in vocational and professional certification exams across various fields. With a focus on Indonesia, IndoCareer provides rich local contexts, spanning six key sectors: (1) healthcare, (2) insurance and finance, (3) creative and design, (4) tourism and hospitality, (5) education and training, and (6) law. Our comprehensive evaluation of 27 large language models shows that these models struggle particularly in fields with strong local contexts, such as insurance and finance. Additionally, while using the entire dataset, shuffling answer options generally maintains consistent evaluation results across models, but it introduces instability specifically in the insurance and finance sectors.}
}
@inproceedings{koto-etal-2024-zero,
  title     = {Zero-shot Sentiment Analysis in Low-Resource Languages Using a Multilingual Sentiment Lexicon},
  venue     = {EACL},
  author    = {Koto, Fajri and Beck, Tilman and Talat, Zeerak and Gurevych, Iryna and Baldwin, Timothy},
  editor    = {Graham, Yvette and Purver, Matthew},
  booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = mar,
  year      = {2024},
  address   = {St. Julian{'}s, Malta},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.eacl-long.18/},
  doi       = {10.18653/v1/2024.eacl-long.18},
  pages     = {298--320},
  abstract  = {Improving multilingual language models capabilities in low-resource languages is generally difficult due to the scarcity of large-scale data in those languages. In this paper, we relax the reliance on texts in low-resource languages by using multilingual lexicons in pretraining to enhance multilingual capabilities. Specifically, we focus on zero-shot sentiment analysis tasks across 34 languages, including 6 high/medium-resource languages, 25 low-resource languages, and 3 code-switching datasets. We demonstrate that pretraining using multilingual lexicons, without using any sentence-level sentiment data, achieves superior zero-shot performance compared to models fine-tuned on English sentiment datasets, and large language models like GPT{--}3.5, BLOOMZ, and XGLM. These findings are observable for unseen low-resource languages to code-mixed scenarios involving high-resource languages.}
}
@inproceedings{ustun-etal-2024-aya,
  title     = {Aya Model: An Instruction Finetuned Open-Access Multilingual Language Model},
  venue     = {ACL},
  author    = {{\"U}st{\"u}n, Ahmet and Aryabumi, Viraat and Yong, Zheng and Ko, Wei-Yin and D{'}souza, Daniel and Onilude, Gbemileke and Bhandari, Neel and Singh, Shivalika and Ooi, Hui-Lee and Kayid, Amr and Vargus, Freddie and Blunsom, Phil and Longpre, Shayne and Muennighoff, Niklas and Fadaee, Marzieh and Kreutzer, Julia and Hooker, Sara},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.acl-long.845/},
  doi       = {10.18653/v1/2024.acl-long.845},
  pages     = {15894--15939},
  abstract  = {Recent breakthroughs in large language models (LLMs) have centered around a handful of data-rich languages. What does it take to broaden access to breakthroughs beyond first-class citizen languages? Our work introduces Aya, a massively multilingual generative language model that follows instructions in 101 languages of which over 50{\%} are considered as lower-resourced. Aya outperforms mT0 and BLOOMZ on the majority of tasks while covering double the number of languages. We introduce extensive new evaluation suites that broaden the state-of-art for multilingual eval across 99 languages {---}{---} including discriminative and generative tasks, human evaluation, and simulated win rates that cover both held-out tasks and in-distribution performance. Furthermore, we conduct detailed investigations on the optimal finetuning mixture composition, data pruning, as well as the toxicity, bias, and safety of our models.}
}
@inproceedings{payoungkhamdee-etal-2024-empirical,
  title     = {An Empirical Study of Multilingual Reasoning Distillation for Question Answering},
  venue     = {EMNLP},
  author    = {Payoungkhamdee, Patomporn and Limkonchotiwat, Peerat and Baek, Jinheon and Manakul, Potsawee and Udomcharoenchaikit, Can and Chuangsuwanich, Ekapol and Nutanong, Sarana},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.442/},
  doi       = {10.18653/v1/2024.emnlp-main.442},
  pages     = {7739--7751},
  abstract  = {Reasoning is one crucial capability in Large Language Models (LLMs), allowing them to perform complex tasks such as solving math problems and multi-step planning. While reasoning capability can emerge in larger models, smaller ones usually have to rely on distillation to transfer this capability from a larger model. However, recent efforts to distill reasoning capabilities have focused mainly on English, leaving multilingual distillation underexplored. To address this gap, this paper examines existing English reasoning distillation methods that utilize a variety of positive rationales in multilingual settings and proposes d-CoT-nR, a novel approach that incorporates incorrect rationales as additional guidance. Empirical results from multilingual high-school examinations show that d-CoT-nR significantly surpasses the baseline, improving accuracy in unseen languages and correctness in step-by-step reasoning.}
}
@inproceedings{phatthiyaphaibun-etal-2024-chie,
    title = "{CHIE}: Generative {MRC} Evaluation for in-context {QA} with Correctness, Helpfulness, Irrelevancy, and Extraneousness Aspects",
    venue = "GenBench @ EMNLP",
    author = "Phatthiyaphaibun, Wannaphong and
      Nonesung, Surapon and
      Limkonchotiwat, Peerat and
      Udomcharoenchaikit, Can and
      Sawatphol, Jitkapat and
      Chuangsuwanich, Ekapol and
      Nutanong, Sarana",
    editor = "Hupkes, Dieuwke and
      Dankers, Verna and
      Batsuren, Khuyagbaatar and
      Kazemnejad, Amirhossein and
      Christodoulopoulos, Christos and
      Giulianelli, Mario and
      Cotterell, Ryan",
    booktitle = "Proceedings of the 2nd GenBench Workshop on Generalisation (Benchmarking) in NLP",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.genbench-1.10/",
    doi = "10.18653/v1/2024.genbench-1.10",
    pages = "154--164",
    abstract = "The evaluation of generative models in Machine Reading Comprehension (MRC) presents distinct difficulties, as traditional metrics like BLEU, ROUGE, METEOR, Exact Match, and F1 score often struggle to capture the nuanced and diverse responses. While embedding-based metrics such as BERTScore and BARTScore focus on semantic similarity, they still fail to fully address aspects such as recognizing additional helpful information and rewarding contextual faithfulness. Recent advances in large language model (LLM) based metrics offer more fine-grained evaluations, but challenges such as score clustering remain. This paper introduces a multi-aspect evaluation framework, CHIE, incorporating aspects of \textbf{C}orrectness, \textbf{H}elpfulness, \textbf{I}rrelevance, and \textbf{E}xtraneousness. Our approach, which uses binary categorical values rather than continuous rating scales, aligns well with human judgments, indicating its potential as a comprehensive and effective evaluation method."
}
@inproceedings{zhang-eickhoff-2024-crocosum,
  title     = {{C}ro{C}o{S}um: A Benchmark Dataset for Cross-Lingual Code-Switched Summarization},
  venue     = {LREC-COLING},
  author    = {Zhang, Ruochen and Eickhoff, Carsten},
  editor    = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.367/},
  pages     = {4113--4126},
  abstract  = {Cross-lingual summarization (CLS) has attracted increasing interest in recent years due to the availability of large-scale web-mined datasets and the advancements of multilingual language models. However, given the rareness of naturally occurring CLS resources, the majority of datasets are forced to rely on translation which can contain overly literal artifacts. This restricts our ability to observe naturally occurring CLS pairs that capture organic diction, including instances of code-switching. This alteration between languages in mid-message is a common phenomenon in multilingual settings yet has been largely overlooked in cross-lingual contexts due to data scarcity. To address this gap, we introduce CroCoSum, a dataset of cross-lingual code-switched summarization of technology news. It consists of over 24,000 English source articles and 18,000 human-written Chinese news summaries, with more than 92{\%} of the summaries containing code-switched phrases. For reference, we evaluate the performance of existing approaches including pipeline, end-to-end, and zero-shot methods. We show that leveraging existing CLS resources as a pretraining step does not improve performance on CroCoSum, indicating the limited generalizability of current datasets. Finally, we discuss the challenges of evaluating cross-lingual summarizers on code-switched generation through qualitative error analyses.}
}
@inproceedings{hudi-etal-2024-disentangling,
  title     = {Disentangling Pretrained Representation to Leverage Low-Resource Languages in Multilingual Machine Translation},
  venue     = {LREC-COLING},
  author    = {Hudi, Frederikus and Qu, Zhi and Kamigaito, Hidetaka and Watanabe, Taro},
  editor    = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.446/},
  pages     = {4978--4989},
  abstract  = {Multilingual neural machine translation aims to encapsulate multiple languages into a single model. However, it requires an enormous dataset, leaving the low-resource language (LRL) underdeveloped. As LRLs may benefit from shared knowledge of multilingual representation, we aspire to find effective ways to integrate unseen languages in a pre-trained model. Nevertheless, the intricacy of shared representation among languages hinders its full utilisation. To resolve this problem, we employed target language prediction and a central language-aware layer to improve representation in integrating LRLs. Focusing on improving LRLs in the linguistically diverse country of Indonesia, we evaluated five languages using a parallel corpus of 1,000 instances each, with experimental results measured by BLEU showing zero-shot improvement of 7.4 from the baseline score of 7.1 to a score of 15.5 at best. Further analysis showed that the gains in performance are attributed more to the disentanglement of multilingual representation in the encoder with the shift of the target language-specific representation in the decoder.}
}
@inproceedings{laosaengpha-etal-2024-learning,
  title     = {Learning Job Title Representation from Job Description Aggregation Network},
  venue     = {ACL Findings},
  author    = {Laosaengpha, Napat and Tativannarat, Thanit and Piansaddhayanon, Chawan and Rutherford, Attapol and Chuangsuwanich, Ekapol},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-acl.77/},
  doi       = {10.18653/v1/2024.findings-acl.77},
  pages     = {1319--1329},
  abstract  = {Learning job title representation is a vital process for developing automatic human resource tools. To do so, existing methods primarily rely on learning the title representation through skills extracted from the job description, neglecting the rich and diverse content within. Thus, we propose an alternative framework for learning job titles through their respective job description (JD) and utilize a Job Description Aggregator component to handle the lengthy description and bidirectional contrastive loss to account for the bidirectional relationship between the job title and its description. We evaluated the performance of our method on both in-domain and out-of-domain settings, achieving a superior performance over the skill-based approach.}
}
@inproceedings{yong-etal-2024-lexc,
  title     = {{L}ex{C}-Gen: Generating Data for Extremely Low-Resource Languages with Large Language Models and Bilingual Lexicons},
  venue     = {EMNLP Findings},
  author    = {Yong, Zheng Xin and Menghini, Cristina and Bach, Stephen},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.818/},
  doi       = {10.18653/v1/2024.findings-emnlp.818},
  pages     = {13990--14009},
  abstract  = {Data scarcity in low-resource languages can be addressed with word-to-word translations from labeled task data in high-resource languages using bilingual lexicons. However, bilingual lexicons often have limited lexical overlap with task data, which results in poor translation coverage and lexicon utilization. We propose lexicon-conditioned data generation LexC-Gen, a method that generates low-resource-language classification task data at scale. Specifically, LexC-Gen first uses high-resource-language words from bilingual lexicons to generate lexicon-compatible task data, and then it translates them into low-resource languages with bilingual lexicons via word translation. Across 17 extremely low-resource languages, LexC-Gen generated data is competitive with expert-translated gold data, and yields on average 5.6 and 8.9 points improvement over existing lexicon-based word translation methods on sentiment analysis and topic classification tasks respectively. Through ablation study, we show that conditioning on bilingual lexicons is the key component of LexC-Gen. LexC-Gen serves as a potential solution to close the performance gap between open-source multilingual models, such as BLOOMZ and Aya-101, and state-of-the-art commercial models like GPT-4o on low-resource-language tasks.}
}
@inproceedings{limkonchotiwat-etal-2024-mccrolin,
  title     = {{M}c{C}rolin: Multi-consistency Cross-lingual Training for Retrieval Question Answering},
  venue     = {EMNLP Findings},
  author    = {Limkonchotiwat, Peerat and Ponwitayarat, Wuttikorn and Lowphansirikul, Lalita and Manakul, Potsawee and Udomcharoenchaikit, Can and Chuangsuwanich, Ekapol and Nutanong, Sarana},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.157/},
  doi       = {10.18653/v1/2024.findings-emnlp.157},
  pages     = {2780--2793},
  abstract  = {Automated question answering (QA) systems are increasingly relying on robust cross-lingual retrieval to identify and utilize information from multilingual sources, ensuring comprehensive and contextually accurate responses. Existing approaches often struggle with consistency across multiple languages and multi-size input scenarios. To address these challenges, we propose McCrolin, a Multi-consistency Cross-lingual training framework, leveraging multi-task learning to enhance cross-lingual consistency, ranking stability, and input-size robustness. Experimental results demonstrate that McCrolin achieves state-of-the-art performance on standard cross-lingual retrieval QA datasets. Furthermore, McCrolin outperforms competitors when dealing with various input sizes on downstream tasks. In terms of generalizability, results from further analysis show that our method is effective for various encoder architectures and sizes.}
}
@inproceedings{pengpun-etal-2024-creating,
  title     = {On Creating an {E}nglish-{T}hai Code-switched Machine Translation in Medical Domain},
  venue     = {EMNLP Findings},
  author    = {Pengpun, Parinthapat and Tiankanon, Krittamate and Chinkamol, Amrest and Kinchagawat, Jiramet and Chairuengjitjaras, Pitchaya and Supholkhan, Pasit and Aussavavirojekul, Pubordee and Boonnag, Chiraphat and Veerakanjana, Kanyakorn and Phimsiri, Hirunkul and Sae-jia, Boonthicha and Sataudom, Nattawach and Ittichaiwong, Piyalitt and Limkonchotiwat, Peerat},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.351/},
  doi       = {10.18653/v1/2024.findings-emnlp.351},
  pages     = {6055--6073},
  abstract  = {Machine translation (MT) in the medical domain plays a pivotal role in enhancing healthcare quality and disseminating medical knowledge. Despite advancements in English-Thai MT technology, common MT approaches often underperform in the medical field due to their inability to precisely translate medical terminologies. Our research prioritizes not merely improving translation accuracy but also maintaining medical terminology in English within the translated text through code-switched (CS) translation. We developed a method to produce CS medical translation data, fine-tuned a CS translation model with this data, and evaluated its performance against strong baselines, such as Google Neural Machine Translation (NMT) and GPT-3.5/GPT-4. Our model demonstrated competitive performance in automatic metrics and was highly favored in human preference evaluations. Our evaluation result also shows that medical professionals significantly prefer CS translations that maintain critical English terms accurately, even if it slightly compromises fluency. Our code and test set are publicly available https://github.com/preceptorai-org/NLLB{\_}CS{\_}EM{\_}NLP2024.}
}
@inproceedings{li-etal-2024-preference,
  title     = {Preference Tuning For Toxicity Mitigation Generalizes Across Languages},
  venue     = {EMNLP Findings},
  author    = {Li, Xiaochen and Yong, Zheng Xin and Bach, Stephen},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.784/},
  doi       = {10.18653/v1/2024.findings-emnlp.784},
  pages     = {13422--13440},
  abstract  = {Detoxifying multilingual Large Language Models (LLMs) has become crucial due to their increasing global use. In this work, we explore zero-shot cross-lingual generalization of preference tuning in detoxifying LLMs. Unlike previous studies that show limited cross-lingual generalization for other safety tasks, we demonstrate that Direct Preference Optimization (DPO) training with only English data can significantly reduce toxicity in multilingual open-ended generations. For example, the probability of mGPT-1.3B generating toxic continuations drops from 46.8{\%} to 3.9{\%} across 17 different languages after training. Our results also extend to other multilingual LLMs, such as BLOOM, Llama3, and Aya-23. Using mechanistic interpretability tools like causal intervention and activation analysis, we identified the dual multilinguality property of MLP layers in LLMs, which explains the cross-lingual generalization of DPO. Finally, we show that bilingual sentence retrieval can predict the cross-lingual transferability of DPO preference tuning.}
}
@inproceedings{forde-etal-2024-evaluating,
  title     = {Re-Evaluating Evaluation for Multilingual Summarization},
  venue     = {EMNLP},
  author    = {Forde, Jessica Zosa and Zhang, Ruochen and Sutawika, Lintang and Aji, Alham Fikri and Cahyawijaya, Samuel and Winata, Genta Indra and Wu, Minghao and Eickhoff, Carsten and Biderman, Stella and Pavlick, Ellie},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.1085/},
  doi       = {10.18653/v1/2024.emnlp-main.1085},
  pages     = {19476--19493},
  abstract  = {Automatic evaluation approaches (ROUGE, BERTScore, LLM-based evaluators) have been widely used to evaluate summarization tasks. Despite the complexities of script differences and tokenization, these approaches have been indiscriminately applied to summarization across multiple languages. While previous works have argued that these approaches correlate strongly with human ratings in English, it remains unclear whether the conclusion holds for other languages. To answer this question, we construct a small-scale pilot dataset containing article-summary pairs and human ratings in English, Chinese and Indonesian. To measure the strength of summaries, our ratings are measured as head-to-head comparisons with resulting Elo scores across four dimensions. Our analysis reveals that standard metrics are unreliable measures of quality, and that these problems are exacerbated in Chinese and Indonesian. We advocate for more nuanced and careful considerations in designing a robust evaluation framework for multiple languages.}
}
@inproceedings{urailertprasert-etal-2024-sea,
title = "{SEA}-{VQA}: {S}outheast {A}sian Cultural Context Dataset For Visual Question Answering",
venue = "ALVR @ ACL",
author = "Urailertprasert, Norawit and
Limkonchotiwat, Peerat and
Suwajanakorn, Supasorn and
Nutanong, Sarana",
editor = "Gu, Jing and
Fu, Tsu-Jui (Ray) and
Hudson, Drew and
Celikyilmaz, Asli and
Wang, William",
booktitle = "Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.alvr-1.15/",
doi = "10.18653/v1/2024.alvr-1.15",
pages = "173--185",
abstract = "Visual Question Answering (VQA) is a critical task that requires the simultaneous understanding of visual and textual information. While significant advancements have been made with multilingual datasets, these often lack cultural specificity, especially in the context of Southeast Asia (SEA). In this paper, we introduce SEA-VQA aiming to highlight the challenges and gaps in existing VQA models when confronted with culturally specific content. Our dataset includes images from eight SEA countries, curated from the UNESCO Cultural Heritage collection. Our evaluation, comparing GPT-4 and GEMINI models, demonstrates substantial performance drops on culture-centric questions compared to the A-OKVQA dataset, a commonsense and world-knowledge VQA benchmark comprising approximately 25,000 questions. Our findings underscore the importance of cultural diversity in VQA datasets and reveal substantial gaps in the ability of current VQA models to handle culturally rich contexts. SEA-VQA serves as a crucial benchmark for identifying these gaps and guiding future improvements in VQA systems."
}
@inproceedings{adilazuarda-etal-2022-indorobusta,
title = "{I}ndo{R}obusta: Towards Robustness Against Diverse Code-Mixed {I}ndonesian Local Languages",
venue = "SUMEval @ ACL",
author = "Adilazuarda, Muhammad Farid and
Cahyawijaya, Samuel and
Winata, Genta Indra and
Fung, Pascale and
Purwarianti, Ayu",
editor = "Ahuja, Kabir and
Anastasopoulos, Antonios and
Patra, Barun and
Neubig, Graham and
Choudhury, Monojit and
Dandapat, Sandipan and
Sitaram, Sunayana and
Chaudhary, Vishrav",
booktitle = "Proceedings of the First Workshop on Scaling Up Multilingual Evaluation",
month = nov,
year = "2022",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.sumeval-1.5/",
doi = "10.18653/v1/2022.sumeval-1.5",
pages = "25--34",
abstract = "Significant progress has been made on Indonesian NLP. Nevertheless, exploration of the code-mixing phenomenon in Indonesian is limited, despite many languages being frequently mixed with Indonesian in daily conversation. In this work, we explore code-mixing in Indonesian with four embedded languages, i.e., English, Sundanese, Javanese, and Malay; and introduce IndoRobusta, a framework to evaluate and improve the code-mixing robustness. Our analysis shows that the pre-training corpus bias affects the model{'}s ability to better handle Indonesian-English code-mixing when compared to other local languages, despite having higher language diversity."
}
@inproceedings{koto-etal-2023-large,
title = "Large Language Models Only Pass Primary School Exams in {I}ndonesia: A Comprehensive Test on {I}ndo{MMLU}",
venue = "EMNLP",
author = "Koto, Fajri and
Aisyah, Nurul and
Li, Haonan and
Baldwin, Timothy",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.760/",
doi = "10.18653/v1/2023.emnlp-main.760",
pages = "12359--12374",
abstract = "Although large language models (LLMs) are often pre-trained on large-scale multilingual texts, their reasoning abilities and real-world knowledge are mainly evaluated based on English datasets. Assessing LLM capabilities beyond English is increasingly vital but hindered due to the lack of suitable datasets. In this work, we introduce IndoMMLU, the first multi-task language understanding benchmark for Indonesian culture and languages, which consists of questions from primary school to university entrance exams in Indonesia. By employing professional teachers, we obtain 14,981 questions across 64 tasks and education levels, with 46{\%} of the questions focusing on assessing proficiency in the Indonesian language and knowledge of nine local languages and cultures in Indonesia. Our empirical evaluations show that GPT-3.5 only manages to pass the Indonesian primary school level, with limited knowledge of local Indonesian languages and culture. Other smaller models such as BLOOMZ and Falcon perform at even lower levels."
}
@inproceedings{zhang-etal-2023-multilingual,
title = "Multilingual Large Language Models Are Not (Yet) Code-Switchers",
venue = "EMNLP",
author = "Zhang, Ruochen and
Cahyawijaya, Samuel and
Cruz, Jan Christian Blaise and
Winata, Genta and
Aji, Alham Fikri",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.774/",
doi = "10.18653/v1/2023.emnlp-main.774",
pages = "12567--12582",
abstract = "Multilingual Large Language Models (LLMs) have recently shown great capabilities in a wide range of tasks, exhibiting state-of-the-art performance through zero-shot or few-shot prompting methods. While there have been extensive studies on their abilities in monolingual tasks, the investigation of their potential in the context of code-switching (CSW), the practice of alternating languages within an utterance, remains relatively uncharted. In this paper, we provide a comprehensive empirical analysis of various multilingual LLMs, benchmarking their performance across four tasks: sentiment analysis, machine translation, summarization and word-level language identification. Our results indicate that despite multilingual LLMs exhibiting promising outcomes in certain tasks using zero or few-shot prompting, they still underperform in comparison to fine-tuned models of much smaller scales. We argue that current ``multilingualism'' in LLMs does not inherently imply proficiency with code-switching texts, calling for future research to bridge this discrepancy."
}
@inproceedings{cahyawijaya-etal-2023-nusacrowd,
title = "{N}usa{C}rowd: Open Source Initiative for {I}ndonesian {NLP} Resources",
venue = "ACL Findings",
author = "Cahyawijaya, Samuel and
Lovenia, Holy and
Aji, Alham Fikri and
Winata, Genta and
Wilie, Bryan and
Koto, Fajri and
Mahendra, Rahmad and
Wibisono, Christian and
Romadhony, Ade and
Vincentio, Karissa and
Santoso, Jennifer and
Moeljadi, David and
Wirawan, Cahya and
Hudi, Frederikus and
Wicaksono, Muhammad Satrio and
Parmonangan, Ivan and
Alfina, Ika and
Putra, Ilham Firdausi and
Rahmadani, Samsul and
Oenang, Yulianti and
Septiandri, Ali and
Jaya, James and
Dhole, Kaustubh and
Suryani, Arie and
Putri, Rifki Afina and
Su, Dan and
Stevens, Keith and
Nityasya, Made Nindyatama and
Adilazuarda, Muhammad and
Hadiwijaya, Ryan and
Diandaru, Ryandito and
Yu, Tiezheng and
Ghifari, Vito and
Dai, Wenliang and
Xu, Yan and
Damapuspita, Dyah and
Wibowo, Haryo and
Tho, Cuk and
Karo Karo, Ichwanul and
Fatyanosa, Tirana and
Ji, Ziwei and
Neubig, Graham and
Baldwin, Timothy and
Ruder, Sebastian and
Fung, Pascale and
Sujaini, Herry and
Sakti, Sakriani and
Purwarianti, Ayu",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.868/",
doi = "10.18653/v1/2023.findings-acl.868",
pages = "13745--13818",
abstract = "We present NusaCrowd, a collaborative initiative to collect and unify existing resources for Indonesian languages, including opening access to previously non-public resources. Through this initiative, we have brought together 137 datasets and 118 standardized data loaders. The quality of the datasets has been assessed manually and automatically, and their value is demonstrated through multiple experiments. NusaCrowd{'}s data collection enables the creation of the first zero-shot benchmarks for natural language understanding and generation in Indonesian and the local languages of Indonesia. Furthermore, NusaCrowd brings the creation of the first multilingual automatic speech recognition benchmark in Indonesian and the local languages of Indonesia. Our work strives to advance natural language processing (NLP) research for languages that are under-represented despite being widely spoken."
}
@inproceedings{yong-etal-2023-prompting,
title = "Prompting Multilingual Large Language Models to Generate Code-Mixed Texts: The Case of South {E}ast {A}sian Languages",
venue = "CALCS-ACL",
author = "Yong, Zheng Xin and
Zhang, Ruochen and
Forde, Jessica and
Wang, Skyler and
Subramonian, Arjun and
Lovenia, Holy and
Cahyawijaya, Samuel and
Winata, Genta and
Sutawika, Lintang and
Cruz, Jan Christian Blaise and
Tan, Yin Lin and
Phan, Long and
Garcia, Rowena and
Solorio, Thamar and
Aji, Alham Fikri",
editor = "Winata, Genta and
Kar, Sudipta and
Zhukova, Marina and
Solorio, Thamar and
Diab, Mona and
Sitaram, Sunayana and
Choudhury, Monojit and
Bali, Kalika",
booktitle = "Proceedings of the 6th Workshop on Computational Approaches to Linguistic Code-Switching",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.calcs-1.5/",
pages = "43--63",
abstract = "While code-mixing is a common linguistic practice in many parts of the world, collecting high-quality and low-cost code-mixed data remains a challenge for natural language processing (NLP) research. The recent proliferation of Large Language Models (LLMs) compels one to ask: how capable are these systems in generating code-mixed data? In this paper, we explore prompting multilingual LLMs in a zero-shot manner to generate code-mixed data for seven languages in South East Asia (SEA), namely Indonesian, Malay, Chinese, Tagalog, Vietnamese, Tamil, and Singlish. We find that publicly available multilingual instruction-tuned models such as BLOOMZ and Flan-T5-XXL are incapable of producing texts with phrases or clauses from different languages. ChatGPT exhibits inconsistent capabilities in generating code-mixed texts, wherein its performance varies depending on the prompt template and language pairing. For instance, ChatGPT generates fluent and natural Singlish texts (an English-based creole spoken in Singapore), but for English-Tamil language pair, the system mostly produces grammatically incorrect or semantically meaningless utterances. Furthermore, it may erroneously introduce languages not specified in the prompt. Based on our investigation, existing multilingual LLMs exhibit a wide range of proficiency in code-mixed data generation for SEA languages. As such, we advise against using LLMs in this context without extensive human checks."
}
@inproceedings{velasco-etal-2023-towards,
title = "Towards Automatic Construction of {F}ilipino {W}ord{N}et: Word Sense Induction and Synset Induction Using Sentence Embeddings",
venue = "SEALP @ ACL",
author = "Velasco, Dan John and
Alba, Axel and
Pelagio, Trisha Gail and
Ramirez, Bryce Anthony and
Cruz, Jan Christian Blaise and
Chua, Unisse and
Samson, Briane Paul and
Cheng, Charibeth",
editor = "Wijaya, Derry and
Aji, Alham Fikri and
Vania, Clara and
Winata, Genta Indra and
Purwarianti, Ayu",
booktitle = "Proceedings of the First Workshop in South East Asian Language Processing",
month = nov,
year = "2023",
address = "Nusa Dua, Bali, Indonesia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.sealp-1.1/",
doi = "10.18653/v1/2023.sealp-1.1",
pages = "1--12"
}
@inproceedings{cahyawijaya-etal-2023-nusawrites,
title = "{N}usa{W}rites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages",
venue = "IJCNLP @ AACL",
author = "Cahyawijaya, Samuel and
Lovenia, Holy and
Koto, Fajri and
Adhista, Dea and
Dave, Emmanuel and
Oktavianti, Sarah and
Akbar, Salsabil and
Lee, Jhonson and
Shadieq, Nuur and
Cenggoro, Tjeng Wawan and
Linuwih, Hanung and
Wilie, Bryan and
Muridan, Galih and
Winata, Genta and
Moeljadi, David and
Aji, Alham Fikri and
Purwarianti, Ayu and
Fung, Pascale",
editor = "Park, Jong C. and
Arase, Yuki and
Hu, Baotian and
Lu, Wei and
Wijaya, Derry and
Purwarianti, Ayu and
Krisnadhi, Adila Alfa",
booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = nov,
year = "2023",
address = "Nusa Dua, Bali",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.ijcnlp-main.60/",
doi = "10.18653/v1/2023.ijcnlp-main.60",
pages = "921--945"
}
@inproceedings{kautsar-etal-2023-indotod,
title = "{I}ndo{T}o{D}: A Multi-Domain {I}ndonesian Benchmark For End-to-End Task-Oriented Dialogue Systems",
venue = "SEALP @ AACL",
author = "Kautsar, Muhammad and
Nurdini, Rahmah and
Cahyawijaya, Samuel and
Winata, Genta and
Purwarianti, Ayu",
editor = "Wijaya, Derry and
Aji, Alham Fikri and
Vania, Clara and
Winata, Genta Indra and
Purwarianti, Ayu",
booktitle = "Proceedings of the First Workshop in South East Asian Language Processing",
month = nov,
year = "2023",
address = "Nusa Dua, Bali, Indonesia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.sealp-1.7/",
doi = "10.18653/v1/2023.sealp-1.7",
pages = "85--99"
}
@article{kotoIndoCultureExploringGeographically2024,
title = {{{IndoCulture}}: {{Exploring Geographically Influenced Cultural Commonsense Reasoning Across Eleven Indonesian Provinces}}},
venue = {TACL},
shorttitle = {{{IndoCulture}}},
author = {Koto, Fajri and Mahendra, Rahmad and Aisyah, Nurul and Baldwin, Timothy},
year = {2024},
month = dec,
journal = {Transactions of the Association for Computational Linguistics},
volume = {12},
pages = {1703--1719},
issn = {2307-387X},
doi = {10.1162/tacl_a_00726},
url = {https://doi.org/10.1162/tacl\_a\_00726},
urldate = {2025-09-12}
}
@misc{liBactrianXMultilingualReplicable2023,
title = {Bactrian-{{X}}: {{Multilingual Replicable Instruction-Following Models}} with {{Low-Rank Adaptation}}},
shorttitle = {Bactrian-{{X}}},
author = {Li, Haonan and Koto, Fajri and Wu, Minghao and Aji, Alham Fikri and Baldwin, Timothy},
year = {2023},
month = oct,
number = {arXiv:2305.15011},
eprint = {2305.15011},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.15011},
url = {http://arxiv.org/abs/2305.15011},
urldate = {2025-09-12},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language}
}
@inproceedings{pengpunSeedFreeSyntheticData2024,
title = {Seed-{{Free Synthetic Data Generation Framework}} for {{Instruction-Tuning LLMs}}: {{A Case Study}} in {{Thai}}},
shorttitle = {Seed-{{Free Synthetic Data Generation Framework}} for {{Instruction-Tuning LLMs}}},
venue = {Student Research Workshop @ ACL},
booktitle = {Proceedings of the 62nd {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}} ({{Volume}} 4: {{Student Research Workshop}})},
author = {Pengpun, Parinthapat and Udomcharoenchaikit, Can and Buaphet, Weerayut and Limkonchotiwat, Peerat},
year = {2024},
pages = {438--457},
publisher = {Association for Computational Linguistics},
address = {Bangkok, Thailand},
doi = {10.18653/v1/2024.acl-srw.38},
url = {https://aclanthology.org/2024.acl-srw.38},
urldate = {2025-09-12},
langid = {english}
}
@inproceedings{romeroCVQACulturallydiverseMultilingual2025,
title = {{{CVQA}}: Culturally-Diverse Multilingual Visual Question Answering Benchmark},
shorttitle = {{{CVQA}}},
booktitle = {Proceedings of the 38th {{International Conference}} on {{Neural Information Processing Systems}}},
author = {Romero, David and Lyu, Chenyang and Wibowo, Haryo Akbarianto and Lynn, Teresa and Hamed, Injy and Kishore, Aditya Nanda and Mandal, Aishik and Dragonetti, Alina and Abzaliev, Artem and Tonja, Atnafu Lambebo and Balcha, Bontu Fufa and Whitehouse, Chenxi and Salamea, Christian and Velasco, Dan John and Adelani, David Ifeoluwa and Le Meur, David and {Villa-Cueva}, Emilio and Koto, Fajri and Farooqui, Fauzan and Belcavello, Frederico and Batnasan, Ganzorig and Vallejo, Gisela and Caulfield, Grainne and Ivetta, Guido and Song, Haiyue and Ademtew, Henok Biadglign and Maina, Hern{\'a}n and Lovenia, Holy and Azime, Israel Abebe and Cruz, Jan Christian Blaise and Gala, Jay and Geng, Jiahui and {Ortiz-Barajas}, Jesus-German and Baek, Jinheon and Dunstan, Jocelyn and Alemany, Laura Alonso and Nagasinghe, Kumaranage Ravindu Yasas and Benotti, Luciana and D'Haro, Luis Fernando and Viridiano, Marcelo and {Estecha-Garitagoitia}, Marcos and Cabrera, Maria Camila Buitrago and {Rodr{\'i}guez-Cantelar}, Mario and Jouitteau, M{\'e}lanie and Mihaylov, Mihail and Etori, Naome and Imam, Mohamed Fazli Mohamed and Adilazuarda, Muhammad Farid and Gochoo, Munkhjargal and Otgonbold, Munkh-Erdene and Niyomugisha, Olivier and Silva, Paula M{\'o}nica and Chitale, Pranjal and Dabre, Raj and Chevi, Rendi and Zhang, Ruochen and Diandaru, Ryandito and Cahyawijaya, Samuel and G{\'o}ngora, Santiago and Jeong, Soyeong and Purkayastha, Sukannya and Kuribayashi, Tatsuki and Clifford, Teresa and Jayakumar, Thanmay and Torrent, Tiago Timponi and Ehsan, Toqeer and Araujo, Vladimir and Kementchedjhieva, Yova and Burzo, Zara and Lim, Zheng Wei and Yong, Zheng Xin and Ignat, Oana and Nwatu, Joan and Mihalcea, Rada and Solorio, Thamar and Aji, Alham Fikri},
year = {2025},
month = jun,
series = {{{NeurIPS}} '24},
volume = {37},
pages = {11479--11505},
doi = {10.5555/3737916.3738282},
url = {https://dl.acm.org/doi/10.5555/3737916.3738282},
publisher = {Curran Associates Inc.},
address = {Red Hook, NY, USA},
web = {https://cvqa-benchmark.org/},
resources = {https://huggingface.co/datasets/afaji/cvqa},
abstract = {Visual Question Answering (VQA) is an important task in multimodal AI, and it is often used to test the ability of vision-language models to understand and reason on knowledge present in both visual and textual data. However, most of the current VQA models use datasets that are primarily focused on English and a few major world languages, with images that are typically Western-centric. While recent efforts have tried to increase the number of languages covered on VQA datasets, they still lack diversity in low-resource languages. More importantly, although these datasets often extend their linguistic range via translation or some other approaches, they usually keep images the same, resulting in narrow cultural representation. To address these limitations, we construct CVQA, a new Culturally-diverse multilingual Visual Question Answering benchmark, designed to cover a rich set of languages and cultures, where we engage native speakers and cultural experts in the data collection process. As a result, CVQA includes culturally-driven images and questions from across 30 countries on four continents, covering 31 languages with 13 scripts, providing a total of 10k questions. We then benchmark several Multimodal Large Language Models (MLLMs) on CVQA, and show that the dataset is challenging for the current state-of-the-art models. This benchmark can serve as a probing evaluation suite for assessing the cultural capability and bias of multimodal models and hopefully encourage more research efforts toward increasing cultural awareness and linguistic diversity in this field.}
}
@inproceedings{yongLowResourceLanguagesJailbreak2023,
title = {Low-{{Resource Languages Jailbreak GPT-4}}},
award = {Best Paper Award},
venue = {SoLaR @ NeurIPS},
booktitle = {Socially {{Responsible Language Modelling Research}} ({{SoLaR}}) {{Workshop}} at {{Thirty-Seventh Annual Conference}} on {{Neural Information Processing Systems}}},
author = {Yong, Zheng Xin and Menghini, Cristina and Bach, Stephen},
year = {2023},
month = nov,
address = {New Orleans, USA},
doi = {10.48550/arXiv.2310.02446},
url = {https://arxiv.org/abs/2310.02446},
pdf = {https://arxiv.org/pdf/2310.02446}
}
@inproceedings{winata-etal-2023-nusax,
title = "{N}usa{X}: Multilingual Parallel Sentiment Dataset for 10 {I}ndonesian Local Languages",
award = "EACL Outstanding Paper",
author = "Winata, Genta Indra and
Aji, Alham Fikri and
Cahyawijaya, Samuel and
Mahendra, Rahmad and
Koto, Fajri and
Romadhony, Ade and
Kurniawan, Kemal and
Moeljadi, David and
Prasojo, Radityo Eko and
Fung, Pascale and
Baldwin, Timothy and
Lau, Jey Han and
Sennrich, Rico and
Ruder, Sebastian",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
venue = "EACL",
url = "https://aclanthology.org/2023.eacl-main.57/",
doi = "10.18653/v1/2023.eacl-main.57",
pages = "815--834",
abstract = "Natural language processing (NLP) has a significant impact on society via technologies such as machine translation and search engines. Despite its success, NLP technology is only widely available for high-resource languages such as English and Chinese, while it remains inaccessible to many languages due to the unavailability of data resources and benchmarks. In this work, we focus on developing resources for languages in Indonesia. Despite being the second most linguistically diverse country, most languages in Indonesia are categorized as endangered and some are even extinct. We develop the first-ever parallel resource for 10 low-resource languages in Indonesia. Our resource includes sentiment and machine translation datasets, and bilingual lexicons. We provide extensive analyses and describe challenges for creating such resources. We hope this work can spark NLP research on Indonesian and other underrepresented languages."
}
@inproceedings{imperial-kochmar-2023-automatic,
title = "Automatic Readability Assessment for Closely Related Languages",
venue = "ACL Findings",
author = "Imperial, Joseph Marvin and
Kochmar, Ekaterina",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.331/",
doi = "10.18653/v1/2023.findings-acl.331",
pages = "5371--5386",
abstract = "In recent years, the main focus of research on automatic readability assessment (ARA) has shifted towards using expensive deep learning-based methods with the primary goal of increasing models' accuracy. This, however, is rarely applicable for low-resource languages where traditional handcrafted features are still widely used due to the lack of existing NLP tools to extract deeper linguistic representations. In this work, we take a step back from the technical component and focus on how linguistic aspects such as mutual intelligibility or degree of language relatedness can improve ARA in a low-resource setting. We collect short stories written in three languages in the Philippines{---}Tagalog, Bikol, and Cebuano{---}to train readability assessment models and explore the interaction of data and features in various cross-lingual setups. Our results show that the inclusion of CrossNGO, a novel specialized feature exploiting n-gram overlap applied to languages with high mutual intelligibility, significantly improves the performance of ARA models compared to the use of off-the-shelf large multilingual language models alone. Consequently, when both linguistic representations are combined, we achieve state-of-the-art results for Tagalog and Cebuano, and baseline scores for ARA in Bikol."
}
@inproceedings{imperial-kochmar-2023-basahacorpus,
title = "{B}asaha{C}orpus: An Expanded Linguistic Resource for Readability Assessment in {C}entral {P}hilippine Languages",
venue = "EMNLP",
author = "Imperial, Joseph Marvin and
Kochmar, Ekaterina",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.388/",
doi = "10.18653/v1/2023.emnlp-main.388",
pages = "6302--6309",
abstract = "Current research on automatic readability assessment (ARA) has focused on improving the performance of models in high-resource languages such as English. In this work, we introduce and release BasahaCorpus as part of an initiative aimed at expanding available corpora and baseline models for readability assessment in lower resource languages in the Philippines. We compiled a corpus of short fictional narratives written in Hiligaynon, Minasbate, Karay-a, and Rinconada{---}languages belonging to the Central Philippine family tree subgroup{---}to train ARA models using surface-level, syllable-pattern, and n-gram overlap features. We also propose a new hierarchical cross-lingual modeling approach that takes advantage of a language{'}s placement in the family tree to increase the amount of available training data. Our study yields encouraging results that support previous work showcasing the efficacy of cross-lingual models in low-resource settings, as well as similarities in highly informative linguistic features for mutually intelligible languages."
}
@inproceedings{yong-etal-2023-bloom,
title = "{BLOOM}+1: Adding Language Support to {BLOOM} for Zero-Shot Prompting",
venue = "ACL",
author = "Yong, Zheng Xin and
Schoelkopf, Hailey and
Muennighoff, Niklas and
Aji, Alham Fikri and
Adelani, David Ifeoluwa and
Almubarak, Khalid and
Bari, M Saiful and
Sutawika, Lintang and
Kasai, Jungo and
Baruwa, Ahmed and
Winata, Genta and
Biderman, Stella and
Raff, Edward and
Radev, Dragomir and
Nikoulina, Vassilina",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.653/",
doi = "10.18653/v1/2023.acl-long.653",
pages = "11682--11703",
abstract = "The BLOOM model is a large publicly available multilingual language model, but its pretraining was limited to 46 languages. To extend the benefits of BLOOM to other languages without incurring prohibitively large costs, it is desirable to adapt BLOOM to new languages not seen during pretraining. In this work, we apply existing language adaptation strategies to BLOOM and benchmark its zero-shot prompting performance on eight new languages in a resource-constrained setting. We find language adaptation to be effective at improving zero-shot performance in new languages. Surprisingly, we find that adapter-based finetuning is more effective than continued pretraining for large models. In addition, we discover that prompting performance is not significantly affected by language specifics, such as the writing system. It is primarily determined by the size of the language adaptation data. We also add new languages to BLOOMZ, which is a multitask finetuned version of BLOOM capable of following task instructions zero-shot. We find including a new language in the multitask fine-tuning mixture to be the most effective method to teach BLOOMZ a new language. We conclude that with sufficient training data language adaptation can generalize well to diverse languages. Our code is available at \url{https://github.com/bigscience-workshop/multilingual-modeling}."
}
@inproceedings{pilar-etal-2023-cebuaner,
title = "{C}ebua{NER}: A New Baseline {C}ebuano Named Entity Recognition Model",
venue = "PACLIC @ ACL",
author = "Pilar, Ma. Beatrice Emanuela and
Dedoroy, Dane and
Papas, Ellyza Mari and
Buenaventura, Mary Loise and
Montefalcon, Myron Darrel and
Padilla, Jay Rhald and
Imperial, Joseph Marvin and
Abisado, Mideth and
Maceda, Lany",
editor = "Huang, Chu-Ren and
Harada, Yasunari and
Kim, Jong-Bok and
Chen, Si and
Hsu, Yu-Yin and
Chersoni, Emmanuele and
A, Pranav and
Zeng, Winnie Huiheng and
Peng, Bo and
Li, Yuxi and
Li, Junlin",
booktitle = "Proceedings of the 37th Pacific Asia Conference on Language, Information and Computation",
month = dec,
year = "2023",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.paclic-1.79/",
pdf = "https://aclanthology.org/2023.paclic-1.79.pdf",
pages = "792--800",
abstract = "Despite being one of the most linguistically diverse groups of countries, computational linguistics and language processing research in Southeast Asia has struggled to match the level of countries from the Global North. Thus, initiatives such as open-sourcing corpora and the development of baseline models for basic language processing tasks are important stepping stones to encourage the growth of research efforts in the field. To answer this call, we introduce CEBUANER, a new baseline model for named entity recognition (NER) in the Cebuano language. Cebuano is the second most-used native language in the Philippines with over 20 million speakers. To build the model, we collected and annotated over 4,000 news articles, the largest of any work in the language, retrieved from online local Cebuano platforms to train algorithms such as Conditional Random Field and Bidirectional LSTM. Our findings show promising results as a new baseline model, achieving over 70{\%} performance on precision, recall, and F1 across all entity tags as well as potential efficacy in a crosslingual setup with Tagalog."
}
@inproceedings{pengpun-etal-2023-cross,
title = "Cross-Lingual Data Augmentation For {T}hai Question-Answering",
venue = "GenBench @ EMNLP",
author = "Pengpun, Parinthapat and
Udomcharoenchaikit, Can and
Buaphet, Weerayut and
Limkonchotiwat, Peerat",
editor = "Hupkes, Dieuwke and
Dankers, Verna and
Batsuren, Khuyagbaatar and
Sinha, Koustuv and
Kazemnejad, Amirhossein and
Christodoulopoulos, Christos and
Cotterell, Ryan and
Bruni, Elia",
booktitle = "Proceedings of the 1st GenBench Workshop on (Benchmarking) Generalisation in NLP",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.genbench-1.16/",
doi = "10.18653/v1/2023.genbench-1.16",
pages = "193--203",
abstract = "This paper presents an innovative data augmentation framework with data quality control designed to enhance the robustness of Question Answering (QA) models in low-resource languages, particularly Thai. Recognizing the challenges posed by the scarcity and quality of training data, we leverage data augmentation techniques in both monolingual and cross-lingual settings. Our approach augments and enriches the original dataset, thereby increasing its linguistic diversity and robustness. We evaluate the robustness of our framework on Machine Reading Comprehension, and the experimental results illustrate the potential of data augmentation to effectively increase training data and improve model generalization in low-resource language settings, offering a promising direction for the data augmentation manner."
}
@inproceedings{muennighoff-etal-2023-crosslingual,
title = "Crosslingual Generalization through Multitask Finetuning",
author = "Muennighoff, Niklas and
Wang, Thomas and
Sutawika, Lintang and
Roberts, Adam and
Biderman, Stella and
Le Scao, Teven and
Bari, M Saiful and
Shen, Sheng and
Yong, Zheng Xin and
Schoelkopf, Hailey and
Tang, Xiangru and
Radev, Dragomir and
Aji, Alham Fikri and
Almubarak, Khalid and
Albanie, Samuel and
Alyafeai, Zaid and
Webson, Albert and
Raff, Edward and
Raffel, Colin",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
venue = "ACL",
url = "https://aclanthology.org/2023.acl-long.891/",
doi = "10.18653/v1/2023.acl-long.891",
pages = "15991--16111",
abstract = "Multitask prompted finetuning (MTF) has been shown to help large language models generalize to new tasks in a zero-shot setting, but so far explorations of MTF have focused on English data and models. We apply MTF to the pretrained multilingual BLOOM and mT5 model families to produce finetuned variants called BLOOMZ and mT0. We find finetuning large multilingual language models on English tasks with English prompts allows for task generalization to non-English languages that appear only in the pretraining corpus. Finetuning on multilingual tasks with English prompts further improves performance on English and non-English tasks leading to various state-of-the-art zero-shot results. We also investigate finetuning on multilingual tasks with prompts that have been machine-translated from English to match the language of each dataset. We find training on these machine-translated prompts leads to better performance on human-written prompts in the respective languages. Surprisingly, we find models are capable of zero-shot generalization to tasks in languages they have never intentionally seen. We conjecture that the models are learning higher-level capabilities that are both task- and language-agnostic. In addition, we introduce xP3, a composite of supervised datasets in 46 languages with English and machine-translated prompts. Our code, datasets and models are freely available at \url{https://github.com/bigscience-workshop/xmtf}."
}
@inproceedings{miranda-2023-developing,
title = "Developing a Named Entity Recognition Dataset for {T}agalog",
venue = "SEALP @ ACL",
author = "Miranda, Lester James V.",
editor = "Wijaya, Derry and
Aji, Alham Fikri and
Vania, Clara and
Winata, Genta Indra and
Purwarianti, Ayu",
booktitle = "Proceedings of the First Workshop in South East Asian Language Processing",