InCLow-LM.github.io/publications.bib at main · InCLow-LM/InCLow-LM.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
# Arianna Bisazza
@misc{neplenbroek2025reading,
  author       = {Neplenbroek, Vera and Bisazza, Arianna and Fern{\'a}ndez, Raquel},
  title        = {Reading Between the Prompts: How Stereotypes Shape {LLM}'s Implicit Personalization},
  year         = {2025},
  month        = may,
  howpublished = {Accepted at EMNLP 2025. arXiv preprint arXiv:2505.16467},
  url          = {https://arxiv.org/abs/2505.16467},
  doi          = {10.48550/arXiv.2505.16467}
}

@inproceedings{lian2025simulating,
  author    = {Lian, Yuchen and Bisazza, Arianna and Verhoef, Tessa},
  title     = {Simulating the Emergence of Differential Case Marking with Communicating Neural-Network Agents},
  booktitle = {Proceedings of the 47th Annual Conference of the Cognitive Science Society (CogSci)},
  year      = {2025},
  month     = jul,
  note      = {Accepted at CogSci 2025},
  url       = {https://escholarship.org/uc/item/5dr7h5tp},
  doi       = {10.48550/arXiv.2502.04038}
}

@inproceedings{neplenbroek-etal-2025-cross,
  author    = {Neplenbroek, Vera and Bisazza, Arianna and Fern{\'a}ndez, Raquel},
  title     = {Cross-Lingual Transfer of Debiasing and Detoxification in Multilingual {LLM}s: An Extensive Investigation},
  editor    = {Che, Wanxiang and Nabende, Joyce and Shutova, Ekaterina and Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  publisher = {Association for Computational Linguistics},
  address   = {Vienna, Austria},
  year      = {2025},
  month     = jul,
  pages     = {2805--2830},
  isbn      = {979-8-89176-256-5},
  doi       = {10.18653/v1/2025.findings-acl.145},
  url       = {https://aclanthology.org/2025.findings-acl.145/},
  abstract  = {Recent generative large language models (LLMs) show remarkable performance in non-English languages, but when prompted in those languages they tend to express higher harmful social biases and toxicity levels. Prior work has shown that finetuning on specialized datasets can mitigate this behavior, and doing so in English can transfer to other languages. In this work, we investigate the impact of different finetuning methods on the model{'}s bias and toxicity, but also on its ability to produce fluent and diverse text. We reduce biases by finetuning on curated non-harmful text, but find only direct preference optimization to be effective for mitigating toxicity. The mitigation caused by applying these methods in English also transfers to non-English languages. We find evidence that the extent to which transfer takes place can be predicted by the amount of data in a given language present in the model{'}s pretraining data. However, this transfer of bias and toxicity mitigation often comes at the expense of decreased language generation ability in non-English languages, highlighting the importance of developing language-specific bias and toxicity mitigation methods.}
}

@inproceedings{haga2024babylm,
  author    = {Haga, Akari and Fukatsu, Akiyo and Oba, Miyu and Bisazza, Arianna and Oseki, Yohei},
  title     = {{BabyLM} Challenge: Exploring the effect of variation sets on language model training efficiency},
  booktitle = {Proceedings of the BabyLM Challenge at CoNLL 2024},
  year      = {2024},
  month     = nov,
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.conll-babylm.23},
  note      = {Outstanding paper award}
}

@inproceedings{lian2024nellcomx,
  author    = {Lian, Yuchen and Verhoef, Tessa and Bisazza, Arianna},
  title     = {{NeLLCom-X}: A Comprehensive Neural-Agent Framework to Simulate Language Learning and Group Communication},
  booktitle = {Proceedings of the 28th Conference on Computational Natural Language Learning (CoNLL)},
  year      = {2024},
  month     = nov,
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.conll-1.19},
  pages     = {220--234}
}

@inproceedings{neplenbroek2024mbbq,
  author    = {Neplenbroek, Vera and Bisazza, Arianna and Fern{\'a}ndez, Raquel},
  title     = {{MBBQ}: {A} Dataset for Cross-Lingual Comparison of Stereotypes in Generative {LLMs}},
  booktitle = {Proceedings of the Conference on Language Modeling (CoLM)},
  year      = {2024},
  month     = may,
  url       = {https://openreview.net/forum?id=X9yV4lFHt4},
  note      = {arXiv preprint arXiv:2406.07243}
}

@inproceedings{shen2024encoding,
  author    = {Shen, Gaofei and Watkins, Michaela and Alishahi, Afra and Bisazza, Arianna and Chrupa{\l}a, Grzegorz},
  title     = {Encoding of lexical tone in self-supervised models of spoken language},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  year      = {2024},
  month     = jun,
  publisher = {Association for Computational Linguistics},
  address   = {Mexico City, Mexico},
  pages     = {4250--4261},
  url       = {https://aclanthology.org/2024.naacl-long.239},
  doi       = {10.18653/v1/2024.naacl-long.239}
}

@article{lian2023communication,
  author    = {Lian, Yuchen and Bisazza, Arianna and Verhoef, Tessa},
  title     = {Communication Drives the Emergence of Language Universals in Neural Agents: Evidence from the Word-order/Case-marking Trade-off},
  journal   = {Transactions of the Association for Computational Linguistics (TACL)},
  year      = {2023},
  month     = oct,
  volume    = {11},
  pages     = {1033--1047},
  url       = {https://doi.org/10.1162/tacl_a_00587},
  doi       = {10.1162/tacl_a_00587}
}

@inproceedings{shen2023wave,
  author    = {Shen, Gaofei and Alishahi, Afra and Bisazza, Arianna and Chrupa{\l}a, Grzegorz},
  title     = {Wave to Syntax: Probing spoken language models for syntax},
  booktitle = {Proceedings of Interspeech 2023},
  year      = {2023},
  month     = aug,
  publisher = {ISCA},
  address   = {Dublin, Ireland},
  pages     = {1259--1263},
  url       = {https://www.isca-speech.org/archive/interspeech_2023/shen23_interspeech.html},
  doi       = {10.21437/Interspeech.2023-679}
}


# Jirui Qi
@article{qi2025models,
  author  = {Qi, Jirui and Chen, Shan and Xiong, Zidi and Fern{\'a}ndez, Raquel and Bitterman, Danielle S. and Bisazza, Arianna},
  title   = {When Models Reason in Your Language: Controlling Thinking Trace Language Comes at the Cost of Accuracy},
  journal = {arXiv preprint arXiv:2505.22888},
  year    = {2025},
  month   = may,
  url     = {https://arxiv.org/abs/2505.22888},
}

@article{qi2025consistency,
  author   = {Qi, Jirui and Fern{\'a}ndez, Raquel and Bisazza, Arianna},
  title    = {On the Consistency of Multilingual Context Utilization in Retrieval-Augmented Generation},
  journal  = {arXiv preprint arXiv:2504.00597},
  year     = {2025},
  month    = apr,
  url      = {https://arxiv.org/abs/2504.00597},
  abstract = {Retrieval-augmented generation (RAG) with large language models (LLMs) has demonstrated strong performance in multilingual question-answering (QA) tasks by leveraging relevant passages retrieved from corpora. In multilingual RAG (mRAG), the retrieved passages can be written in languages other than that of the query entered by the user, making it challenging for LLMs to effectively utilize the provided information. Recent research suggests that retrieving passages from multilingual corpora can improve RAG performance, particularly for low-resource languages. However, the extent to which LLMs can leverage different kinds of multilingual contexts to generate accurate answers, *independently from retrieval quality*, remains understudied. In this paper, we conduct an extensive assessment of LLMs' ability to (i) make consistent use of a relevant passage regardless of its language, (ii) respond in the expected language, and (iii) focus on the relevant passage even when multiple `distracting' passages in different languages are provided in the context. Our experiments with four LLMs across three QA datasets covering a total of 48 languages reveal a surprising ability of LLMs to extract the relevant information from out-language passages, but a much weaker ability to formulate a full answer in the correct language. Our analysis, based on both accuracy and feature attribution techniques, further shows that distracting passages negatively impact answer quality regardless of their language. However, distractors in the query language exert a slightly stronger influence. Taken together, our findings deepen the understanding of how LLMs utilize context in mRAG systems, providing directions for future improvements.}
}
@inproceedings{qi-etal-2023-cross,
  author    = {Qi, Jirui and Fern{\'a}ndez, Raquel and Bisazza, Arianna},
  title     = {Cross-Lingual Consistency of Factual Knowledge in Multilingual Language Models},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Singapore},
  year      = {2023},
  month     = dec,
  pages     = {10650--10666},
  doi       = {10.18653/v1/2023.emnlp-main.658},
  url       = {https://aclanthology.org/2023.emnlp-main.658/},
  abstract  = {Multilingual large-scale Pretrained Language Models (PLMs) have been shown to store considerable amounts of factual knowledge, but large variations are observed across languages. With the ultimate goal of ensuring that users with different language backgrounds obtain consistent feedback from the same model, we study the cross-lingual consistency (CLC) of factual knowledge in various multilingual PLMs. To this end, we propose a Ranking-based Consistency (RankC) metric to evaluate knowledge consistency across languages independently from accuracy. Using this metric, we conduct an in-depth analysis of the determining factors for CLC, both at model level and at language-pair level. Among other results, we find that increasing model size leads to higher factual probing accuracy in most languages, but does not improve cross-lingual consistency. Finally, we conduct a case study on CLC when new factual associations are inserted in the PLMs via model editing. Results on a small sample of facts inserted in English reveal a clear pattern whereby the new piece of knowledge transfers only to languages with which English has a high RankC score. All code and data are released at https://github.com/Betswish/Cross-Lingual-Consistency.}
}
@inproceedings{qi-etal-2024-model,
  author    = {Qi, Jirui and Sarti, Gabriele and Fern{\'a}ndez, Raquel and Bisazza, Arianna},
  title     = {Model Internals-based Answer Attribution for Trustworthy Retrieval-Augmented Generation},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, Florida, USA},
  year      = {2024},
  month     = nov,
  pages     = {6037--6053},
  doi       = {10.18653/v1/2024.emnlp-main.347},
  url       = {https://aclanthology.org/2024.emnlp-main.347/},
  abstract  = {Ensuring the verifiability of model answers is a fundamental challenge for retrieval-augmented generation (RAG) in the question answering (QA) domain. Recently, self-citation prompting was proposed to make large language models (LLMs) generate citations to supporting documents along with their answers. However, self-citing LLMs often struggle to match the required format, refer to non-existent sources, and fail to faithfully reflect LLMs' context usage throughout the generation. In this work, we present MIRAGE {--} Model Internals-based RAG Explanations {--} a plug-and-play approach using model internals for faithful answer attribution in RAG applications. MIRAGE detects context-sensitive answer tokens and pairs them with retrieved documents contributing to their prediction via saliency methods. We evaluate our proposed approach on a multilingual extractive QA dataset, finding high agreement with human answer attribution. On open-ended QA, MIRAGE achieves citation quality and efficiency comparable to self-citation while also allowing for a finer-grained control of attribution parameters. Our qualitative evaluation highlights the faithfulness of MIRAGE's attributions and underscores the promising application of model internals for RAG answer attribution. Code and data released at https://github.com/Betswish/MIRAGE.}
}
@inproceedings{chen-etal-2024-sifo,
  author    = {Chen, Xinyi and Liao, Baohao and Qi, Jirui and Eustratiadis, Panagiotis and Monz, Christof and Bisazza, Arianna and de Rijke, Maarten},
  title     = {The {SIF}o Benchmark: Investigating the Sequential Instruction Following Ability of Large Language Models},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, Florida, USA},
  year      = {2024},
  month     = nov,
  pages     = {1691--1706},
  doi       = {10.18653/v1/2024.findings-emnlp.92},
  url       = {https://aclanthology.org/2024.findings-emnlp.92/},
  abstract  = {Following multiple instructions is a crucial ability for large language models (LLMs). Evaluating this ability comes with significant challenges: (i) limited coherence between multiple instructions, (ii) positional bias where the order of instructions affects model performance, and (iii) a lack of objectively verifiable tasks. To address these issues, we introduce a benchmark designed to evaluate models' abilities to follow multiple instructions through sequential instruction following (SIFo) tasks. In SIFo, the successful completion of multiple instructions is verifiable by examining only the final instruction. Our benchmark evaluates instruction following using four tasks (text modification, question answering, mathematics, and security rule following), each assessing different aspects of sequential instruction following. Our evaluation of popular LLMs, both closed-source and open-source, shows that more recent and larger models significantly outperform their older and smaller counterparts on the SIFo tasks, validating the benchmark's effectiveness. All models struggle with following sequences of instructions, hinting at an important lack of robustness of today's language models.}
}
@inproceedings{liu-etal-2025-pointwise,
  author    = {Liu, Tianyu and Qi, Jirui and He, Paul and Bisazza, Arianna and Sachan, Mrinmaya and Cotterell, Ryan},
  title     = {Pointwise Mutual Information as a Performance Gauge for Retrieval-Augmented Generation},
  editor    = {Chiruzzo, Luis and Ritter, Alan and Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Albuquerque, New Mexico},
  year      = {2025},
  month     = apr,
  pages     = {1628--1647},
  isbn      = {979-8-89176-189-6},
  doi       = {10.18653/v1/2025.naacl-long.78},
  url       = {https://aclanthology.org/2025.naacl-long.78/},
  abstract  = {Recent work suggests that large language models enhanced with retrieval-augmented generation are easily influenced by the order in which the retrieved documents are presented to the model when solving tasks such as question answering (QA). However, there is no method to date that exploits this phenomenon to improve generation. To fill this gap, in this study, we show that the pointwise mutual information between a context and a question is an effective gauge for language model performance. Importantly, this gauge does not depend on knowing the answer to the question \textit{a priori}. Through experiments on two question-answering datasets using a variety of large language models, we find evidence for an empirical correlation between answer accuracy and pointwise mutual information. Additionally, we propose two methods that use the pointwise mutual information between a document and a question as a gauge for selecting and constructing prompts that lead to better performance, whose effectiveness we demonstrate through experimentation.}
}


# Francesca Padovani

@misc{padovani2025childdirectedlanguagedoesconsistently,
  author        = {Padovani, Francesca and Jumelet, Jaap and Matusevych, Yevgen and Bisazza, Arianna},
  title         = {Child-Directed Language Does Not Consistently Boost Syntax Learning in Language Models},
  year          = {2025},
  month         = may,
  booktitle     = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  howpublished  = {Accepted at EMNLP 2025. arXiv preprint arXiv:2505.23689},
  eprint        = {2505.23689},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2505.23689},
}

@inproceedings{padovani2024automatic,
  author    = {Padovani, Francesca and Marchesi, Caterina and Pasqua, Eleonora and Galletti, Martina and Nardi, Daniele},
  title     = {Automatic Text Simplification: A Comparative Study in Italian for Children with Language Disorders},
  booktitle = {Proceedings of the 13th Workshop on Natural Language Processing for Computer Assisted Language Learning},
  year      = {2024},
  pages     = {176--186}
}


# Ezgi Başar

@article{bacsar2025turblimp,
  author    = {Ba{\c{s}}ar, Ezgi and Padovani, Francesca and Jumelet, Jaap and Bisazza, Arianna},
  title     = {{TurBLiMP}: A {Turkish} Benchmark of Linguistic Minimal Pairs},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  journal   = {arXiv preprint arXiv:2506.13487},
  year      = {2025},
  note      = {Accepted at EMNLP 2025},
  url       = {https://arxiv.org/abs/2506.13487}
}

# Jaap Jumelet
@misc{jumelet2025multiblimp10massivelymultilingual,
  author        = {Jumelet, Jaap and Weissweiler, Leonie and Bisazza, Arianna},
  title         = {{MultiBLiMP} 1.0: A Massively Multilingual Benchmark of Linguistic Minimal Pairs},
  year          = {2025},
  eprint        = {2504.02768},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2504.02768},
}

# Yuqing Zhang

@inproceedings{yuqing2025nellcom-lex,
  author    = {Zhang, Yuqing and {\"U}rker, Ecesu and Verhoef, Tessa and Boleda, Gemma and Bisazza, Arianna},
  title     = {{NeLLCom-Lex}: A Neural-agent Framework to Study the Interplay between Lexical Systems and Language Use},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  year      = {2025}
}

@inproceedings{zhang2024neural,
  author    = {Zhang, Yuqing and Verhoef, Tessa and van Noord, Gertjan and Bisazza, Arianna},
  title     = {Neural-agent Language Learning and Communication: Emergence of Dependency Length Minimization},
  booktitle = {Proceedings of the Annual Meeting of the Cognitive Science Society},
  year      = {2024},
  volume    = {46}
}

@inproceedings{zhang2024endowing,
  author    = {Zhang, Yuqing and Verhoef, Tessa and van Noord, Gertjan and Bisazza, Arianna},
  title     = {Endowing neural language learners with human-like biases: A case study on dependency length minimization},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  year      = {2024},
  pages     = {5819--5832}
}


# Gabriele Sarti
@inproceedings{candussio-etal-2025-bridging,
  title     = {Bridging Logic and Learning: Decoding Temporal Logic Embeddings via Transformers},
  author    = {Candussio, Sara and Saveri, Gaia and Sarti, Gabriele and Bortolussi, Luca},
  booktitle = {European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD)},
  url       = {https://arxiv.org/abs/2507.07808},
  year      = {2025}
}

@inproceedings{ghasemimadani-etal-2025-noiser,
  title     = {Noiser: Bounded Input Perturbations for Attributing Large Language Models},
  author    = {Mohammad Reza Ghasemi Madani and
               Aryo Pradipta Gema and
               Yu Zhao and
               Gabriele Sarti and
               Pasquale Minervini and
               Andrea Passerini},
  booktitle = {Second Conference on Language Modeling (COLM)},
  url       = {https://arxiv.org/abs/2504.02911},
  year      = {2025}
}


@inproceedings{sarti-etal-2025-unsupervised,
  author    = {Sarti, Gabriele and Zouhar, Vil{\'e}m and Nissim, Malvina and Bisazza, Arianna},
  title     = {Unsupervised Word-level Quality Estimation for Machine Translation Through the Lens of Annotators (Dis)agreement},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Suzhou, China},
  year      = {2025},
  month     = nov,
  url       = {https://arxiv.org/abs/2505.23183}
}

@article{sarti-etal-2025-qe4pe,
  author  = {Sarti, Gabriele and Zouhar, Vil{\'e}m and Chrupa{\l}a, Grzegorz and Guerberof-Arenas, Ana and Nissim, Malvina and Bisazza, Arianna},
  title   = {{QE4PE}: Word-level Quality Estimation for Human Post-Editing},
  journal = {Transactions of the Association for Computational Linguistics},
  year    = {2025},
  url     = {https://arxiv.org/abs/2503.03044}
}

@article{scalena-sarti-etal-2025-steering,
  author  = {Scalena, Daniel and Sarti, Gabriele and Bisazza, Arianna and Fersini, Elisabetta and Nissim, Malvina},
  title   = {Steering Large Language Models for Machine Translation Personalization},
  journal = {arXiv preprint arXiv:2505.16612},
  year    = {2025},
  url     = {https://arxiv.org/abs/2505.16612}
}

@inproceedings{sarti-etal-2024-verbis,
  author    = {Sarti, Gabriele and Caselli, Tommaso and Nissim, Malvina and Bisazza, Arianna},
  title     = {Non Verbis, Sed Rebus: Large Language Models Are Weak Solvers of {I}talian Rebuses},
  editor    = {Dell'Orletta, Felice and Lenci, Alessandro and Montemagni, Simonetta and Sprugnoli, Rachele},
  booktitle = {Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)},
  publisher = {CEUR Workshop Proceedings},
  address   = {Pisa, Italy},
  year      = {2024},
  month     = dec,
  pages     = {888--897},
  isbn      = {979-12-210-7060-6},
  url       = {https://aclanthology.org/2024.clicit-1.96/},
}

@inproceedings{sarti-etal-2024-eurekarebus,
  author    = {Sarti, Gabriele and Caselli, Tommaso and Bisazza, Arianna and Nissim, Malvina},
  title     = {{E}ureka{R}ebus - Verbalized Rebus Solving with {LLM}s: A {CALAMITA} Challenge},
  editor    = {Dell'Orletta, Felice and Lenci, Alessandro and Montemagni, Simonetta and Sprugnoli, Rachele},
  booktitle = {Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)},
  publisher = {CEUR Workshop Proceedings},
  address   = {Pisa, Italy},
  year      = {2024},
  month     = dec,
  pages     = {1202--1208},
  isbn      = {979-12-210-7060-6},
  url       = {https://aclanthology.org/2024.clicit-1.132/},
}

@inproceedings{scalena-etal-2024-multi,
  author    = {Scalena, Daniel and Sarti, Gabriele and Nissim, Malvina},
  title     = {Multi-property Steering of Large Language Models with Dynamic Activation Composition},
  editor    = {Belinkov, Yonatan and Kim, Najoung and Jumelet, Jaap and Mohebbi, Hosein and Mueller, Aaron and Chen, Hanjie},
  booktitle = {Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, Florida, US},
  year      = {2024},
  month     = nov,
  pages     = {577--603},
  doi       = {10.18653/v1/2024.blackboxnlp-1.34},
  url       = {https://aclanthology.org/2024.blackboxnlp-1.34},
}

@inproceedings{sarti-nissim-2024-it5-text,
  author    = {Sarti, Gabriele and Nissim, Malvina},
  title     = {{IT}5: Text-to-text Pretraining for {I}talian Language Understanding and Generation},
  editor    = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  publisher = {ELRA and ICCL},
  address   = {Torino, Italy},
  year      = {2024},
  month     = may,
  pages     = {9422--9433},
  url       = {https://aclanthology.org/2024.lrec-main.823},
  abstract  = {We introduce IT5, the first family of encoder-decoder transformer models pretrained specifically on Italian. We document and perform a thorough cleaning procedure for a large Italian corpus and use it to pretrain four IT5 model sizes. We then introduce the ItaGen benchmark, which includes a broad range of natural language understanding and generation tasks for Italian, and use it to evaluate the performance of IT5 models and multilingual baselines. We find monolingual IT5 models to provide the best scale-to-performance ratio across tested models, consistently outperforming their multilingual counterparts and setting a new state-of-the-art for Italian language generation.},
}

@article{ferrando-etal-2024-primer,
  author  = {Ferrando, Javier and Sarti, Gabriele and Bisazza, Arianna and Costa-juss{\`a}, Marta R.},
  title   = {A Primer on the Inner Workings of Transformer-based Language Models},
  journal = {arXiv preprint arXiv:2405.00208},
  year    = {2024},
  month   = may,
  url     = {https://arxiv.org/abs/2405.00208},
}

@inproceedings{langedijk-etal-2024-decoderlens,
  author    = {Langedijk, Anna and Mohebbi, Hosein and Sarti, Gabriele and Zuidema, Willem and Jumelet, Jaap},
  title     = {{D}ecoder{L}ens: Layerwise Interpretation of Encoder-Decoder Transformers},
  editor    = {Duh, Kevin and Gomez, Helena and Bethard, Steven},
  booktitle = {Findings of the Association for Computational Linguistics: NAACL 2024},
  publisher = {Association for Computational Linguistics},
  address   = {Mexico City, Mexico},
  year      = {2024},
  month     = jun,
  pages     = {4764--4780},
  url       = {https://aclanthology.org/2024.findings-naacl.296},
  abstract  = {In recent years, several interpretability methods have been proposed to interpret the inner workings of Transformer models at different levels of precision and complexity. In this work, we propose a simple but effective technique to analyze encoder-decoder Transformers. Our method, which we name DecoderLens, allows the decoder to cross-attend representations of intermediate encoder activations instead of using the default final encoder output. The method thus maps uninterpretable intermediate vector representations to human-interpretable sequences of words or symbols, shedding new light on the information flow in this popular but understudied class of models. We apply DecoderLens to question answering, logical reasoning, speech recognition and machine translation models, finding that simpler subtasks are solved with high precision by low and intermediate encoder layers.},
}

@inproceedings{sarti-etal-2023-quantifying,
  author    = {Sarti, Gabriele and Chrupa{\l}a, Grzegorz and Nissim, Malvina and Bisazza, Arianna},
  title     = {Quantifying the Plausibility of Context Reliance in Neural Machine Translation},
  booktitle = {The Twelfth International Conference on Learning Representations (ICLR 2024)},
  publisher = {OpenReview},
  address   = {Vienna, Austria},
  year      = {2024},
  month     = may,
  url       = {https://openreview.net/forum?id=XTHfNGI3zT}
}

@inproceedings{sarti-etal-2023-ramp,
  author    = {Sarti, Gabriele and Htut, Phu Mon and Niu, Xing and Hsu, Benjamin and Currey, Anna and Dinu, Georgiana and Nadejde, Maria},
  title     = {{RAMP}: Retrieval and Attribute-Marking Enhanced Prompting for Attribute-Controlled Translation},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  year      = {2023},
  month     = jul,
  pages     = {1476--1490},
  url       = {https://aclanthology.org/2023.acl-short.126},
}

@article{edman-etal-2024-character,
  author  = {Edman, Lukas and Sarti, Gabriele and Toral, Antonio and van Noord, Gertjan and Bisazza, Arianna},
  title   = {Are Character-level Translations Worth the Wait? Comparing {ByT5} and {mT5} for Machine Translation},
  journal = {Transactions of the Association for Computational Linguistics},
  volume  = {12},
  pages   = {392--410},
  year    = {2024},
  month   = apr,
  issn    = {2307-387X},
  doi     = {10.1162/tacl_a_00651},
  url     = {https://doi.org/10.1162/tacl_a_00651},
  eprint  = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00651/2364129/tacl\_a\_00651.pdf},
}

@inproceedings{sarti-etal-2023-inseq,
  author    = {Sarti, Gabriele and Feldhus, Nils and Sickert, Ludwig and van der Wal, Oskar},
  title     = {Inseq: An Interpretability Toolkit for Sequence Generation Models},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  year      = {2023},
  month     = jul,
  pages     = {421--435},
  url       = {https://aclanthology.org/2023.acl-demo.40},
}

@inproceedings{sarti-etal-2024-democratizing,
    title = "Democratizing Advanced Attribution Analyses of Generative Language Models with the {Inseq} Toolkit",
    author = "Sarti, Gabriele  and
        Feldhus, Nils  and
        Qi, Jirui  and
        Nissim, Malvina  and
        Bisazza, Arianna",
    booktitle = "xAI-2024 Late-breaking Work, Demos and Doctoral Consortium Joint Proceedings",
    series = "{CEUR} Workshop Proceedings",
    volume = "3793",
    month = jul,
    year = "2024",
    address = "Valletta, Malta",
    publisher = "CEUR-WS.org",
    url = "https://ceur-ws.org/Vol-3793/paper_37.pdf",
    pages = "289--296",
}

@article{miaschi-etal-2022-probing,
  title     = "Probing Linguistic Knowledge in {Italian} Neural Language Models across Language Varieties",
  author    = "Miaschi, Alessio and Sarti, Gabriele and Brunato, Dominique and Dell'Orletta, Felice and Venturi, Giulia",
  journal   = "Italian Journal of Computational Linguistics (IJCoL)",
  publisher = "OpenEdition",
  volume    =  8,
  number    =  1,
  pages     = "25--44",
  month     =  jul,
  year      =  2022,
  url       = "https://journals.openedition.org/ijcol/965",
  issn      = "2499-4553",
  doi       = "10.4000/ijcol.965"
}

@inproceedings{sarti-etal-2022-divemt,
  author    = {Sarti, Gabriele and Bisazza, Arianna and Guerberof-Arenas, Ana and Toral, Antonio},
  editor    = {Goldberg, Yoav and Kozareva, Zornitsa and Zhang, Yue},
  title     = {{D}iv{EMT}: Neural Machine Translation Post-Editing Effort Across Typologically Diverse Languages},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  year      = {2022},
  month     = dec,
  address   = {Abu Dhabi, United Arab Emirates},
  publisher = {Association for Computational Linguistics},
  pages     = {7795--7816},
  doi       = {10.18653/v1/2022.emnlp-main.532},
  url       = {https://aclanthology.org/2022.emnlp-main.532},
  abstract  = {We introduce DivEMT, the first publicly available post-editing study of Neural Machine Translation (NMT) over a typologically diverse set of target languages. Using a strictly controlled setup, 18 professional translators were instructed to translate or post-edit the same set of English documents into Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. During the process, their edits, keystrokes, editing times and pauses were recorded, enabling an in-depth, cross-lingual evaluation of NMT quality and post-editing effectiveness. Using this new dataset, we assess the impact of two state-of-the-art NMT systems, Google Translate and the multilingual mBART-50 model, on translation productivity. We find that post-editing is consistently faster than translation from scratch. However, the magnitude of productivity gains varies widely across systems and languages, highlighting major disparities in post-editing effectiveness for languages at different degrees of typological relatedness to English, even when controlling for system architecture and training data size. We publicly release the complete dataset including all collected behavioral data, to foster new research on the translation capabilities of NMT systems for typologically diverse languages.},
}

@inproceedings{bianchi-etal-2023-contrastive,
  author       = {Bianchi, Federico and
                  Attanasio, Giuseppe and
                  Pisoni, Raphael and
                  Terragni, Silvia and
                  Sarti, Gabriele and
                  Balestri, Dario},
  editor       = {Boschetti, Federico and
                  Lebani, Gianluca E. and
                  Magnini, Bernardo and
                  Novielli, Nicole},
  title        = {Contrastive Language-Image Pre-training for the {Italian} Language},
  booktitle    = {Proceedings of the 9th Italian Conference on Computational Linguistics,
                  Venice, Italy, November 30 - December 2, 2023},
  series       = {{CEUR} Workshop Proceedings},
  volume       = {3596},
  publisher    = {CEUR-WS.org},
  year         = {2023},
  url          = {https://ceur-ws.org/Vol-3596/paper9.pdf},
}


# Yevgen Matusevych
@inproceedings{zhou-matusevych-2025-curse,
  author    = {Zhou, Yuwen and Matusevych, Yevgen},
  title     = {Curse of bilinguality: Evaluating monolingual and bilingual language models on {C}hinese linguistic benchmarks},
  booktitle = {Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})},
  year      = {2025},
  publisher = {Association for Computational Linguistics},
  pages     = {622--630},
  url       = {https://aclanthology.org/2025.gem-1.58/}
}

@inproceedings{oneata2025,
    title = "The mutual exclusivity bias of bilingual visually grounded speech models",
    author = "Onea{\c{t}}{\u{a}}, Dan  and
      Nortje, Leanne  and
      Matusevych, Yevgen  and
      Kamper, Herman",
    booktitle = "Proceedings of Interspeech 2025",
    year = "2025",
    publisher = "ISCA",
    pages = "5043--5047",
    doi = "10.21437/Interspeech.2025-343",
    url = "https://www.isca-archive.org/interspeech_2025/oneata25_interspeech.html"
}