-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpubs.bib
More file actions
2337 lines (2141 loc) · 285 KB
/
pubs.bib
File metadata and controls
2337 lines (2141 loc) · 285 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% Encoding: UTF-8
%%%%% Books %%%%%
@book{Pell14e,
  author    = {Pellegrini, Alessandro},
  title     = {Parallelization of Discrete Event Simulation Models},
  publisher = {Sapienza Università Editrice},
  series    = {Studi e Ricerche},
  year      = {2015},
  month     = nov,
  isbn      = {978-88-98533-59-6},
  abstract  = {Simulation is a powerful technique to represent the evolution of real-world phenomena or systems over time. It has been extensively used in different research fields (from medicine to biology, from economy, to disaster rescue) to study the behaviour of complex systems during their evolution (symbiotic simulation) or before their actual realization (what-if analysis).
A traditional way to achieve high performance simulations is the employment of Parallel Discrete Event Simulation (PDES) techniques, which are based on the partitioning of the simulation model into Logical Processes (LPs) that can execute events in parallel on different CPUs and/or different CPU cores, and rely on synchronization mechanisms to achieve causally consistent execution of simulation events. As it is well recognized, the optimistic synchronization approach, namely the Time Warp protocol, which is based on rollback for recovering possible timestamp-order violations due to the absence of block-until-safe policies for event processing, is likely to favour speedup in general application/architectural contexts.
However, the optimistic PDES paradigm implicitly relies on a programming model that drifts from traditional sequential-style programming, given that there is no notion of global address space (fully accessible while processing events at any LP). Furthermore, there is the underlying assumption that the code associated with event handlers cannot execute unrecoverable operations given their speculative processing nature. Nevertheless, even though no unrecoverable action is ever executed by event handlers, some means to actually undo the action upon request needs to be devised and implemented within the software stack.
On the other hand, sequential-style programming is a very easy paradigm for the development of simulation code, given that it does not require the programmer to reason about memory partitioning (and therefore message passing) and speculative (concurrent) processing of the application.
In this thesis, we present methodological and technical innovations which will show how it is possible, by developing innovative runtime mechanisms, to allow a programmer to implement his simulation model in a fully sequential way, and have the underlying simulation framework to execute it in parallel according to speculative processing techniques. Some of the approaches we provide show applicability in either shared- or distributed-memory systems, while others will be specifically tailored to multi/many-core architectures.
We will clearly show, during the development of these supports, what is the effect on performance of these solutions, which will nevertheless be negligible, allowing a fruitful exploitation of the available computing power. In the end, we will highlight which are the clear benefits on the programming model that the developer will experience by relying on these innovative solutions.},
}
%%%%% Books Chapters %%%%%
@incollection{Rug15,
  author    = {Rughetti, Diego and Di Sanzo, Pierangelo and Pellegrini, Alessandro and Ciciani, Bruno and Quaglia, Francesco},
  title     = {Tuning the Level of Concurrency in Software Transactional Memory: An Overview of Recent Analytical, Machine Learning and Mixed Approaches},
  booktitle = {Transactional Memory. Foundations, Algorithms, Tools, and Applications},
  editor    = {Guerraoui, Rachid and Romano, Paolo},
  publisher = {Springer International Publishing},
  series    = {Lecture Notes in Computer Science},
  volume    = {8913},
  pages     = {395--417},
  year      = {2015},
  isbn      = {978-3-319-14719-2},
  doi       = {10.1007/978-3-319-14720-8_18},
  abstract  = {Synchronization transparency offered by Software Transactional Memory (STM) must not come at the expense of run-time efficiency, thus demanding from the STM-designer the inclusion of mechanisms properly oriented to performance and other quality indexes. Particularly, one core issue to cope with in STM is related to exploiting parallelism while also avoiding thrashing phenomena due to excessive transaction rollbacks, caused by excessively high levels of contention on logical resources, namely concurrently accessed data portions. A means to address run-time efficiency consists in dynamically determining the best-suited level of concurrency (number of threads) to be employed for running the application (or specific application phases) on top of the STM layer. For too low levels of concurrency, parallelism can be hampered. Conversely, over-dimensioning the concurrency level may give rise to the aforementioned thrashing phenomena caused by excessive data contention—an aspect which has reflections also on the side of reduced energy-efficiency. In this chapter we overview a set of recent techniques aimed at building “application-specific” performance models that can be exploited to dynamically tune the level of concurrency to the best-suited value. Although they share some base concepts while modeling the system performance vs the degree of concurrency, these techniques rely on disparate methods, such as machine learning or analytic methods (or combinations of the two), and achieve different tradeoffs in terms of the relation between the precision of the performance model and the latency for model instantiation. Implications of the different tradeoffs in real-life scenarios are also discussed.},
}
@incollection{Qua14,
  author    = {Quaglia, Francesco and Pellegrini, Alessandro and Vitali, Roberto},
  title     = {Reshuffling {PDES} Platforms for Multi/Many-core Machines: a Perspective with focus on Load Sharing},
  booktitle = {Modeling and Simulation-based Systems Engineering Handbook},
  editor    = {Gianni, Daniele and D'Ambrogio, Andrea and Tolk, Andreas},
  publisher = {CRC Press},
  year      = {2014},
  month     = dec,
  pages     = {203--232},
  isbn      = {978-1-4665-7145-7},
  doi       = {10.1201/b17902-10},
  abstract  = {In this chapter, we discuss some key aspects related to the reorganization process of these platforms and present in detail a recent literature approach exactly tackling this issue. The presentation is also targeted at showing how the approach, which is based on the symmetric multithreading software programming paradigm, can be suited for a change in the perspective on how to exploit computing resources for PDES applications in a balanced and effective manner. This is achieved via an innovative load-sharing paradigm suited for PDES systems run on top of multicore machines.},
}
%%%%% Journal Articles %%%%%
@article{Mar23b,
  author    = {Marotta, Romolo and Ianni, Mauro and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {A Conflict-Resilient Lock-Free Linearizable Calendar Queue},
  journal   = {ACM Transactions on Parallel Computing},
  year      = {2023},
  month     = dec,
  issn      = {2329-4949},
  publisher = {ACM},
  series    = {TOPC},
  doi       = {10.1145/3635163},
  abstract  = {In the last two decades, great attention has been devoted to the design of non-blocking and linearizable data structures, which enable exploiting the scaled-up degree of parallelism in off-the-shelf shared-memory multi-core machines. In this context, priority queues are highly challenging.
Indeed, concurrent attempts to extract the highest-priority item are prone to create detrimental thread conflicts that lead to abort/retry of the operations.
In this article, we present the first priority queue that jointly provides:
i) lock-freedom and linearizability;
ii) conflict resiliency against concurrent extractions;
iii) adaptiveness to different contention profiles; and
iv) amortized constant-time access for both insertions and extractions. Beyond presenting our solution, we also provide proof of its correctness based on an assertional approach.
Also, we present an experimental study on a 64-CPU machine, showing that our proposal provides performance improvements over state-of-the-art non-blocking priority queues. },
}
@article{Car23,
  author    = {Carnà, Stefano and Marotta, Romolo and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {Strategies and Software Support for the Management of Hardware Performance Counters},
  journal   = {Software: Practice and Experience},
  year      = {2023},
  month     = jul,
  volume    = {53},
  number    = {10},
  pages     = {1928--1957},
  issn      = {1097-024X},
  publisher = {Wiley},
  series    = {SPE},
  doi       = {10.1002/spe.3236},
  abstract  = {Hardware Performance Counters (HPCs) are facilities offered by most off-the-shelf CPU architectures. They are a vital support to post-mortem performance profiling and are exploited by standard tools such as Linux or Intel V-Tune. Nevertheless, an increasing number of application domains (e.g., simulation, task-based high-performance computing, or cybersecurity) are exploiting them to perform different activities, such as self-tuning, autonomic optimization, and\slash or system inspection. This repurposing of HPCs can be difficult, e.g., because of the overhead for extracting relevant information. This overhead might render any online or self-tuning activity ineffective. This article discusses various practical strategies to exploit HPCs beyond post-mortem profiling, suitable for different application contexts. The presented strategies are accompanied by a general primer on HPCs usage on Linux. We also provide reference x86 (both Intel and AMD) implementations targeting the Linux kernel, upon which we present an experimental assessment of the viability of our proposals.},
}
@article{DiS23,
  author    = {Di Sanzo, Pierangelo and Quaglia, Francesco},
  title     = {On the Effects of Transaction Data Access Patterns on Performance in Lock-Based Concurrency Control},
  journal   = {Transactions on Computers},
  year      = {2023},
  volume    = {72},
  number    = {6},
  pages     = {1718--1732},
  publisher = {IEEE},
  series    = {TC},
  doi       = {10.1109/TC.2022.3222084},
}
@article{DeA23,
  author    = {De Angelis, Emanuele and De Angelis, Guglielmo and Pellegrini, Alessandro and Proietti, Maurizio},
  title     = {What Makes Test Programs Similar in Microservices Applications?},
  journal   = {Journal of Systems and Software},
  year      = {2023},
  month     = jan,
  volume    = {201},
  pages     = {111674},
  issn      = {0164-1212},
  publisher = {Elsevier},
  series    = {JSS},
  doi       = {10.1016/j.jss.2023.111674},
  abstract  = {The emergence of microservice architecture calls for novel methodologies and technological frameworks that support the design, development, and maintenance of applications structured according to this new architectural style. In this paper, we consider the issue of designing suitable strategies for the governance of testing activities within the microservices paradigm. We focus on the problem of discovering implicit relations between test programs that help to avoid re-running all the available test suites each time one of its constituents evolves. We propose a dynamic analysis technique and its supporting framework that collects information about the invocations of local and remote APIs. Information on test program execution is obtained in two ways: instrumenting the test program code or running a symbolic execution engine. The extracted information is processed by a rule-based automated reasoning engine, which infers implicit similarities among test programs. We show that our analysis can be used to support the reduction of test suites. The proposed approach has been validated against two real-world microservice applications.},
}
@article{Pell22,
  author    = {Pellegrini, Alessandro and Di~Sanzo, Pierangelo and Piccione, Andrea and Quaglia, Francesco},
  title     = {Design and Implementation of a Fully-Transparent Partial Abort Support for Software Transactional Memory},
  journal   = {Software: Practice and Experience},
  year      = {2022},
  month     = jun,
  volume    = {52},
  number    = {11},
  pages     = {2456--2475},
  issn      = {1097-024X},
  publisher = {Wiley},
  series    = {SPE},
  doi       = {10.1002/spe.3134},
  abstract  = {Software Transactional Memory (STM) provides synchronization support to ensure atomicity and isolation when threads access shared data in concurrent applications. With STM, shared data accesses are encapsulated within transactions, which are automatically handled by the STM layer. Hence, programmers are not requested to use code-synchronization mechanisms explicitly, like locking.
In this article, we present our experience in designing and implementing a partial abort scheme for STM. The objective of our work is threefold: 1) enabling STM to undo only part of the transaction execution in the case of conflict, 2) designing a scheme that is fully transparent to programmers, thus also allowing to run existing STM applications without modifications, and 3) providing a scheme that can be easily integrated within existing STM runtime environments without altering their internal structure. The scheme that we designed is based on automated software instrumentation, which injects into the application capabilities to undo the required portions of transaction executions. Further, it can correctly undo also non-transactional operations executed on the stack and the heap during a transaction. This capability provides programmers with the advantage of writing transactional code without concerns about the side effects of aborted transactions on both shared and thread-private data. We integrated and evaluated our partial abort scheme within the TinySTM open-source library. We analyze the experimental results we achieved with common STM benchmark applications, focusing on the advantages and disadvantages of the proposed solutions for implementing our scheme's different components. Hence, we highlight the appropriate choices and possible solutions to improve partial abort schemes further.},
}
@article{Car22,
  author    = {Carnà, Stefano and Ferracci, Serena and Quaglia, Francesco and Pellegrini, Alessandro},
  title     = {Fight Hardware with Hardware: System-wide Detection and Mitigation of Side-Channel Attacks using Performance Counters},
  journal   = {Digital Threats: Research and Practice},
  year      = {2022},
  publisher = {ACM},
  series    = {DTRAP},
  doi       = {10.1145/3519601},
  abstract  = {We present a kernel-level infrastructure that allows system-wide detection of malicious applications attempting to exploit cache-based side-channel attacks to break the process confinement enforced by standard operating systems. This infrastructure relies on hardware performance counters to collect information at runtime from all applications running on the machine. High-level detection metrics are derived from these measurements to maximize the likelihood of promptly detecting a malicious application. Our experimental assessment shows that we can catch a large family of side-channel attacks with a significantly reduced overhead in the system. We also discuss countermeasures that can be enacted once a process is suspected of carrying out a side-channel attack to increase the overall tradeoff between the system’s security level and the delivered performance under non-suspected process executions.},
}
@article{Mar22,
  author    = {Marotta, Romolo and Ianni, Mauro and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {{NBBS}: A Non-Blocking Buddy System for Multi-core Machines},
  journal   = {Transactions on Computers},
  year      = {2022},
  month     = mar,
  volume    = {71},
  number    = {3},
  pages     = {599--612},
  publisher = {IEEE},
  series    = {TC},
  doi       = {10.1109/TC.2021.3060393},
  abstract  = {Common implementations of core memory allocation components handle concurrent allocation/release requests by synchronizing threads via spin-locks. This approach is not prone to scale, a problem that has been addressed in the literature by introducing layered allocation services or replicating the core allocators—the bottom-most ones within the layered architecture. Both these solutions tend to reduce the pressure of actual concurrent accesses to each individual core allocator. In this article, we explore an alternative approach to scalability of memory allocation/release, which can be still combined with those literature proposals. We present a fully non-blocking buddy system, where threads performing concurrent allocations/releases do not undergo any spin-lock based synchronization. Our solution allows threads to proceed in parallel, and commit their allocations/releases unless a conflict is materialized while handling the allocator metadata—memory fragmentation and coalescing is also carried out in a fully non-blocking manner. Conflict detection relies in our solution on atomic Read-Modify-Write (RMW) machine instructions, guaranteed to execute atomically by the processor firmware. We also provide a proof of the correctness of our non-blocking buddy system and show the results of a comparative study that outlines the advantages of our solution with respect to the Linux-kernel buddy system, which is one of the most diffused and optimized buddy systems in the state of the art.},
}
@article{Sil21,
  author    = {Silvestri, Emiliano and Pellegrini, Alessandro and Di~Sanzo, Pierangelo and Quaglia, Francesco},
  title     = {Effective Runtime Management of Tasks and Priorities in {GNU} {OpenMP} Applications},
  journal   = {Transactions on Computers},
  year      = {2022},
  month     = oct,
  volume    = {71},
  number    = {10},
  pages     = {2632--2645},
  publisher = {IEEE},
  series    = {TC},
  doi       = {10.1109/TC.2021.3139463},
  abstract  = {OpenMP has become a reference standard for the design of parallel applications. This standard is evolving very fast, thus offering ever new opportunities to the application programmers. However, OpenMP runtime environments are often not fully aligned to the actual requirements imposed by the evolution of such standard. Among the main lacks, we find: (a) a limited capability to effectively cope with task priorities, and (b) the inadequacy in guaranteeing core properties while processing tasks such as the so called work-conservativeness---the ability of the OpenMP runtime environment to fully exploit the underlying multi-processor/multi-core machine through the avoidance of thread-blocking phases. In this article we present the design of extensions to the GNU OpenMP (GOMP) implementation, integrated into gcc, which allow the effective management of tasks and their priorities. Our proposal is based on a user-space library---modularly combined with the one already offered by GOMP---and an external kernel-level Linux module---offering the opportunity to exploit raising hardware facilities for the purpose of task/priority management. We also provide experimental results showing the effectiveness of our proposal, achieved by running either OpenMP common benchmarks or a new benchmark application (named Hashtag-Text) that we explicitly devised in order to stress the OpenMP runtime environment in relation to the above-mentioned task/priority management aspects.},
}
@article{Gig21,
  author    = {Gigante, Gabriella and Palumbo, Roberto and Pascarella, Domenico and Pellegrini, Alessandro and Duca, Gabriella and Piera, Miquel Àngel and Ramos, Juan José},
  title     = {Support to Design for Air Traffic Management: An Approach with Agent-Based Modelling and Evolutionary Search},
  journal   = {International Journal of Aviation, Aeronautics, and Aerospace},
  year      = {2021},
  volume    = {8},
  number    = {1},
  series    = {IJAAA},
  doi       = {10.15394/ijaaa.2021.1561},
  abstract  = {To enhance Air Traffic Management (ATM) and meet the future traffic demand and environmental requirements, present ATM system is going to be modified (SESAR Joint Undertaking, 2017), designing new services to be integrated in future architecture considering the evolution of present fragmented structure of the airspace and the entanglement of air routes. Such a change process is complicated due to the nature of ATM, which is a large-scale Socio-Technical System (STS), typically involving a complex interaction between humans, machines and the environment. In such kind of systems, managing their evolution is a complex and difficult task since the social and technical implications of any proposed concept should be fully assessed before a choice is made whether or not to proceed with the related development. Often, simulation tools are also used to support the design of the concept itself by enabling what-if-analyses. However, these may be too effort and time consuming due to the exponential growth of the required analysis cases. A quite common mismatch between the performance evaluations in simulated conditions and those achieved in real life is represented by the partial assessment of human aspects that can be performed throughout the new concept lifecycle from its lowest maturity level up to “ready to market”.
The proposed work defines an approach to support the design of new ATM solutions, including the evaluation on human behaviour. The approach adopts a combined paradigm, which involves Agent-Based Modelling and Simulation (ABMS) to specify and analyse the ATM models, and Agent-based Evolutionary Search (AES) to optimize the design of the new solutions. A specific case study is used to demonstrate the effectiveness of the proposed approach. Transition from Direct Routing Airspace (DRA) to Free Routing Airspace (FRA), respectively described by Solution #32 and Solution #33 in the SESAR solutions catalogue (SESAR Joint Undertaking, 2017), is used for both validation and experimentation activities. In detail, the proposed experimentation case regards the design of sector collapsing/decollapsing configuration to optimize controller workloads. The achieved results are presented and discussed.},
}
@article{Con21,
  author    = {Conoci, Stefano and Di Sanzo, Pierangelo and Pellegrini, Alessandro and Ciciani, Bruno and Quaglia, Francesco},
  title     = {On Power Capping and Performance Optimization of Multi-threaded Applications},
  journal   = {Concurrency and Computation: Practice and Experience},
  year      = {2021},
  month     = jan,
  volume    = {33},
  number    = {11},
  publisher = {Wiley},
  series    = {CCPE},
  doi       = {10.1002/cpe.6205},
  abstract  = {Multi-threaded applications facilitate the exploitation of the computing power of multicore architectures. On the other hand, these applications can become extremely energy-intensive, in contrast with the need for limiting the energy usage of computing systems.
In this article, we explore the design of techniques enabling multi-threaded applications to maximize their performance under a power cap. We consider two control parameters: the number of cores used by the application, and the core power state. We target the design of an auto-tuning power-capping technique with minimal intrusiveness and high portability, which is agnostic about the workload profile of the application. We investigate two different approaches for building the strategy for selecting the best configuration of the parameters under control, namely a heuristic approach and a model-based approach. Through an extensive experimental study, we evaluate the effectiveness of the proposed technique considering two different selection strategies, and we compare them with existing solutions.},
}
@article{DiS21,
  author    = {Di Sanzo, Pierangelo and Avresky, Dimiter R. and Pellegrini, Alessandro},
  title     = {Autonomic Rejuvenation of Cloud Applications as a Countermeasure to Software Anomalies},
  journal   = {Software: Practice and Experience},
  year      = {2021},
  month     = jan,
  volume    = {51},
  number    = {1},
  pages     = {46--71},
  issn      = {1097-024X},
  publisher = {Wiley},
  series    = {SPE},
  doi       = {10.1002/spe.2908},
  abstract  = {Failures in computer systems can be often tracked down to software anomalies of various kinds. In many scenarios, it could be difficult, unfeasible, or unprofitable to carry out extensive debugging activity to spot the causes of anomalies and remove them. In other cases, taking corrective actions may lead to undesirable service downtime. In this article, we propose an alternative approach to cope with the problem of software anomalies in cloud-based applications, and we present the design of a distributed autonomic framework that implements our approach. It exploits the elastic capabilities of cloud infrastructures, and relies on machine learning models, proactive rejuvenation techniques and a new load balancing approach. By putting together all these elements, we show that it is possible to improve both availability and performance of applications deployed over heterogeneous cloud regions and subject to frequent failures. Overall, our study demonstrates the viability of our approach, thus opening the way towards its adoption, and encouraging further studies and practical experiences to evaluate and improve it.},
}
@article{Pell20d,
  author    = {Pellegrini, Alessandro},
  title     = {Replication of Computational Results Report for ``Green Simulation with Database {Monte Carlo}''},
  journal   = {ACM Transactions on Modeling and Computer Simulation},
  year      = {2020},
  month     = dec,
  volume    = {31},
  number    = {1},
  issn      = {1049-3301},
  publisher = {ACM},
  series    = {TOMACS},
  doi       = {10.1145/3426823},
  abstract  = {This article presents the reproducibility results associated with the article ``Green Simulation with Database Monte Carlo'' by Mingbin Feng and
Jeremy Staum. The authors have uploaded their artifact to Zenodo, which ensures a long-term retention of the artifact. The artifact, which is based on a set of R scripts, allows to easily regenerate data for the figures and the tables, it completes successfully, and allows to reproduce all the experimental results in the article.
The article can thus receive the Artifacts Available, the Artifacts Evaluated---Functional, and the Results Reproduced badges.},
}
@article{Pell20c,
  author    = {Pellegrini, Alessandro and Di Sanzo, Pierangelo and Bevilacqua, Beatrice and Duca, Gabriella and Pascarella, Domenico and Palumbo, Roberto and Ramos, Juan José and Piera, Miquel Àngel and Gigante, Gabriella},
  title     = {Simulation-based Evolutionary Optimization of Air Traffic Management},
  journal   = {IEEE Access},
  year      = {2020},
  month     = sep,
  volume    = {8},
  pages     = {161551--161570},
  issn      = {2169-3536},
  publisher = {IEEE},
  series    = {Access},
  doi       = {10.1109/ACCESS.2020.3021192},
  abstract  = {In the context of aerospace engineering, the optimization of processes often may require to solve multi-objective optimization problems, including mixed variables, multi-modal and non-differentiable quantities, possibly involving highly-expensive objective function evaluations. In Air Traffic Management (ATM), the optimization of procedures and protocols becomes even more complicated, due to the involvement of human controllers, which act as final decision points in the control chain.
In this article, we propose the use of computational intelligence techniques, such as Agent-Based Modelling and Simulation (ABMS) and Evolutionary Computing (EC), to design a simulation-based distributed architecture to optimize control plans and procedures in the context of ATM. We rely on Agent-Based fast-time simulations to carry out offline what-if analysis of multiple scenarios, also taking into account human-related decisions, during the strategic or pre-tactical phases. The scenarios are constructed using real-world traffic data traces, while multiple optimization variables governed by an EC algorithm allow to explore the search space to identify the best solutions. Our optimization approach relies on ad-hoc multi-objective performance metrics which allow to assess the goodness of the control of aircraft and air traffic regulations.
We present experimental results which prove the viability of our approach, comparing them with real-world data traces, and proving their meaningfulness from an Air Traffic Control perspective.},
}
@Article{Mar20,
author = {Marotta, Romolo and Tiriticco, Davide and Di Sanzo, Pierangelo and Pellegrini, Alessandro and Ciciani, Bruno and Quaglia, Francesco},
title = {Mutable Locks: Combining the Best of Spin and Sleep Locks},
journal = {Concurrency and Computation: Practice and Experience},
year = {2020},
volume = {32},
number = {22},
issn = {1532-0634},
month = jun,
abstract = {In this article we present Mutable Locks, a synchronization construct with the same semantic of traditional locks (such as spin locks or sleep locks), but with a self-tuned optimized trade off between responsiveness and CPU-time usage during threads’ wait phases. Mutable locks tackle the need for efficient synchronization supports in the era of multi-core machines, where the run-time performance should be optimized while reducing resource usage. This goal should be achieved with no intervention by the programmers. Our proposal is intended for exploitation in generic concurrent applications, where scarce or no knowledge is available about the underlying software/hardware stack and the workload. This is an adverse scenario for static choices between spinning and sleeping, which is tackled by our mutable locks thanks to their hybrid waiting phase and self-tuning capabilities.},
publisher = {Wiley},
series = {CCPE},
doi = {10.1002/CPE.5858}
}
@Article{Pri20,
author = {Principe, Matteo and Tocci, Tommaso and Di Sanzo, Pierangelo and Quaglia, Francesco and Pellegrini, Alessandro},
title = {A Distributed Shared-Memory Middleware for Speculative Parallel Discrete Event Simulation},
journal = {ACM Transactions on Modeling and Computer Simulation},
year = {2020},
volume = {30},
number = {2},
issn = {1049-3301},
pages = {11:1--11:26},
month = feb,
abstract = {The large diffusion of multi-core machines has pushed the research in the field of Parallel Discrete Event Simulation (PDES) towards new programming paradigms, based on the exploitation of shared memory. On the opposite side, the advent of Cloud computing—and the possibility to group together many (low-cost) virtual machines to form a distributed-memory cluster capable of hosting simulation applications—has raised the need to bridge shared-memory programming and seamless distributed execution. In this article, we present the design of a distributed middleware that transparently allows a PDES application coded for shared memory systems to run on clusters of (Cloud) resources. Our middleware is based on a synchronization protocol called Event & Cross State (ECS) Synchronization. It allows cross-simulation-object access by event handlers, thus representing a powerful tool for the development of various types of PDES applications. We also provide data for an experimental assessment of our middleware architecture, which has been integrated into the open source ROOT-Sim speculative PDES platform.},
doi = {10.1145/3373335},
publisher = {ACM},
series = {TOMACS},
}
@article{Qua20,
author = {Quaglia, Francesco and Theodoropoulos, Georgios and Pellegrini, Alessandro},
title = {Editorial to the Special Issue on the Principles of Advanced Discrete Simulation (PADS)},
journal = {ACM Transactions on Modeling and Computer Simulation},
year = {2020},
month = mar,
number = {2},
pages = {8:1--8:2},
volume = {30},
doi = {10.1145/3381903}
}
@Article{DiS19,
author = {Di Sanzo, Pierangelo and Pellegrini, Alessandro and Sannicandro, Marco and Ciciani, Bruno and Quaglia, Francesco},
title = {Adaptive Model-based Scheduling in Software Transactional Memory},
journal = {IEEE Transactions on Computers},
year = {2020},
volume = {69},
number = {5},
pages = {621--632},
issn = {0018-9340},
month = may,
abstract = {Software Transactional Memory (STM) stands as powerful concurrent programming paradigm, enabling atomicity and isolation while accessing shared data. On the downside, STM may suffer from performance degradation due to excessive conflicts among concurrent transactions, which cause waste of CPU-cycles and energy because of transaction aborts. An approach to cope with this issue consists of putting in place smart scheduling strategies which temporarily suspend the execution of some transaction in order to reduce the transaction conflict rate. In this article, we present an adaptive model-based transaction scheduling technique relying on a Markov Chain-based performance model of STM systems. Our scheduling technique is adaptive in a twofold sense: (i) it controls the execution of transactions depending on throughput predictions by the model as a function of the current system state, (ii) it re-tunes on-line the Markov Chain-based model to adapt it—and the outcoming transaction scheduling decisions—to dynamic variations of the workload. We have been able to achieve the latter target thanks to the fact that our performance model is extremely lightweight. In fact, to be recomputed, it requires a reduced set of input parameters, whose values can be estimated via a few on-line samples related to the current workload dynamics. We also present a scheduler that implements our adaptive technique, which we integrated within the open source TinySTM package. Further, we report the results of an experimental study based on the STAMP benchmark suite, which has been aimed at assessing both the accuracy of our performance model in predicting the actual system throughput and the advantages of the adaptive scheduling policy over literature techniques},
doi = {10.1109/TC.2019.2954139},
publisher = {IEEE},
series = {TC},
}
@Article{Pell19,
author = {Pellegrini, Alessandro and Quaglia, Francesco},
title = {Cross-State Events: a New Approach to Parallel Discrete Event Simulation and its Speculative Runtime Support},
journal = {Journal of Parallel and Distributed Computing},
year = {2019},
issn = {0743-7315},
month = oct,
pages = {48--68},
volume = {132},
abstract = {We present a new approach to Parallel Discrete Event Simulation (PDES), where we enable the execution of so-called cross-state events. During their processing, the state of multiple concurrent simulation objects can be accessed in read/write mode, as opposed to classical partitioned accesses. This is done with no pre-declaration of this type of access by the programmer, hence also coping with non-determinism. In our proposal, cross-state events are supported by a speculative runtime environment fully transparently to the application code. This is done through an ad-hoc memory management architecture and an extension of the classical Time Warp synchronization protocol. This extension, named Event and Cross-State (ECS) synchronization, ensures causally-consistent speculative parallel execution of discrete event applications by allowing all events to observe the snapshot of the model execution trajectory that would have been observed in a timestamp-ordered execution of the same model. An experimental assessment of our proposal shows how it can significantly reduce the application development complexity, while also providing advantages in terms of performance},
doi = {10.1016/j.jpdc.2019.05.003},
publisher = {Elsevier},
series = {JPDC},
}
@Article{Ian19,
author = {Ianni, Mauro and Pellegrini, Alessandro and Quaglia, Francesco},
title = {Anonymous Readers Counting: A Wait-free Multi-word Atomic Register Algorithm for Scalable Data Sharing on Multi-core Machines},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = {2019},
issn = {1045-9219},
month = feb,
pages = {286--299},
volume = {30},
abstract = {In this article we present Anonymous Readers Counting (ARC), a multi-word atomic (1,N) register algorithm for multi-core machines. ARC exploits Read-Modify-Write (RMW) instructions to coordinate the writer and reader threads in a wait-free manner and enables large-scale data sharing by admitting up to (2^32 - 2) concurrent readers on off-the-shelf 64-bits machines, as opposed to the most advanced RMW-based approach which is limited to 58 readers on the same kind of machines. Further, ARC avoids multiple copies of the register content when accessing it—this is a problem that affects classical register algorithms based on atomic read/write operations on single words. Thus it allows for higher scalability with respect to the register size. Moreover, ARC explicitly reduces the overall energy consumption, via a proper limitation of RMW instructions in case of read operations re-accessing a still-valid snapshot of the register content, and by showing constant time for read operations and amortized constant time for write operations. Our proposal has therefore a strong focus on real-world off-the-shelf architectures, allowing us to capture properties which benefit both performance and energy consumption. A proof of correctness of our register algorithm is also provided, together with experimental data for a comparison with literature proposals. Beyond assessing ARC on physical platforms, we carry out as well an experimentation on virtualized infrastructures, which shows the resilience of wait-free synchronization as provided by ARC with respect to CPU-steal times, proper of modern paradigms such as cloud computing. Finally, we discuss how to extend ARC for scenarios with multiple writers and multiple readers—the so called (M,N) register. 
This is achieved not by changing the operations (and their wait-free nature) executed along the critical path of the threads, rather only changing the ratio between the number of buffers keeping the register snapshots and the number of threads to coordinate, as well as the number of bits used for counting readers within a 64-bit mask accessed via RMW instructions—just depending on the target balance between the number of readers and the number of writers to be supported.},
doi = {10.1109/TPDS.2018.2865932},
publisher = {IEEE},
series = {TPDS},
}
@Article{Cin17b,
author = {Cingolani, Davide and Pellegrini, Alessandro and Quaglia, Francesco},
title = {Transparently Mixing Undo Logs and Software Reversibility for State Recovery in Optimistic PDES},
journal = {ACM Transactions on Modeling and Computer Simulation},
year = {2017},
issn = {1049-3301},
month = may,
number = {2},
pages = {11:1--11:26},
volume = {27},
abstract = {The Time Warp synchronization protocol for Parallel Discrete Event Simulation (PDES) is universally considered a viable solution to exploit the intrinsic simulation model parallelism and to provide model execution speedup. Yet it leads the PDES system to execute events in an order that may generate causal inconsistencies that need to be recovered via rollback, which requires restoration of a previous (consistent) simulation state whenever a causality violation is detected. The rollback operation is so critical for the performance of a Time Warp system that it has been extensively studied in the literature for decades to find approaches suitable to optimize it. The proposed solutions can be roughly classified as based on either checkpointing or reverse computing. In this article, we explore the practical design and implementation of a fully new approach based on the runtime generation of so-called undo code blocks, which are blocks of instructions implementing the reverse memory side effects generated by the forward execution of the events. However, this is not done by recomputing the original values to be restored, as instead it occurs in reverse computing schemes. Hence, the philosophy undo code blocks rely on is similar in spirit to that of undo-logs (as a form of checkpointing). Nevertheless, they are not data logs (as instead checkpoints are); rather, they are logs of instructions. Our proposal is fully transparent, thanks to the reliance on static software instrumentation (targeting the x86 architecture and Linux systems). Also, as we show, it can be combined with classical checkpointing to further improve the runtime behavior of the state recoverability support as a function of the workload. We also present experimental results related to our implementation, which is released as free software and fully integrated into the open source ROOT-Sim package. Experimental data support the viability and effectiveness of our proposal.},
doi = {10.1145/3077583},
publisher = {ACM Press},
series = {TOMACS},
}
@Article{Pell17b,
author = {Pellegrini, Alessandro and Quaglia, Francesco},
title = {A Fine-grain Time-sharing Time Warp System},
journal = {ACM Transactions on Modeling and Computer Simulation},
year = {2017},
issn = {1049-3301},
month = may,
number = {2},
volume = {27},
abstract = {Several techniques have been proposed to improve the performance of Parallel Discrete Event Simulation platforms relying on the Time Warp (optimistic) synchronization protocol. Among them we can mention optimized approaches for state restore, as well as techniques for load balancing or (dynamically) controlling the speculation degree, the latter being specifically targeted at reducing the incidence of causality errors leading to waste of computation. However, in state-of-the-art Time Warp systems, events’ processing is not preemptable, which may prevent the possibility to promptly react to the injection of higher priority (say, lower timestamp) events. Delaying the processing of these events may, in turn, give rise to higher incidence of incorrect speculation. In this article, we present the design and realization of a fine-grain time-sharing Time Warp system, to be run on multi-core Linux machines, which makes systematic use of event preemption in order to dynamically reassign the CPU to higher priority events/tasks. Our proposal is based on a truly dual mode execution, application versus platform, which includes a timer-interrupt-based support for bringing control back to platform mode for possible CPU reassignment according to very fine grain periods. The latter facility is offered by an ad-hoc timer-interrupt management module for Linux, which we release, together with the overall time-sharing support, within the open source ROOT-Sim platform. An experimental assessment based on the classical PHOLD benchmark and two real-world models is presented, which shows how our proposal effectively leads to the reduction of the incidence of causality errors, especially when running with higher degrees of parallelism.},
doi = {10.1145/3013528},
publisher = {ACM Press},
series = {TOMACS},
}
@Article{Pell16b,
author = {Pellegrini, Alessandro and Peluso, Sebastiano and Quaglia, Francesco and Vitali, Roberto},
title = {Transparent Speculative Parallelization of Discrete Event Simulation Applications Using Global Variables},
journal = {International Journal of Parallel Programming},
year = {2016},
issn = {0885-7458},
month = dec,
number = {6},
pages = {1200--1247},
volume = {44},
abstract = {Parallelizing (compute-intensive) discrete event simulation (DES) applications is a classical approach for speeding up their execution and for making very large/complex simulation models tractable. This has been historically achieved via parallel DES (PDES) techniques, which are based on partitioning the simulation model into distinct simulation objects (somehow resembling objects in classical object-oriented programming), whose states are disjoint, which are executed concurrently and rely on explicit event-exchange (or event-scheduling) primitives as the means to support mutual dependencies and notification of their state updates. With this approach, the application developer is necessarily forced to reason about state separation across the objects, thus being not allowed to rely on shared information, such as global variables, within the application code. This implicitly leads to the shift of the user-exposed programming model to one where sequential-style global variable accesses within the application code are not allowed. In this article we remove this limitation by providing support for managing global variables in the context of DES code developed in ANSI-C, which gets automatically parallelized. Particularly, we focus on speculative (also termed optimistic) PDES systems that run on top of multi-core machines, where simulation objects can concurrently process their events with no guarantee of causal consistency and actual violations of causality rules are recovered through rollback/recovery schemes. In compliance with the nature of speculative processing, in our proposal global variables are transparently mapped to multi-versions, so as to avoid any form of safety predicate verification upon their updates. Consistency is ensured via the introduction of a new rollback/recovery scheme based on detecting global variables’ reads on non-correct versions. 
At the same time, efficiency in the execution is guaranteed by managing multi-version variables’ lists via non-blocking algorithms. Furthermore, the whole approach is fully transparent, being it based on automatized instrumentation of the application software (particularly ELF objects). Hence the programmer is exposed to the classical (and easy to code) sequential-style programming scheme while accessing any global variable. An experimental assessment of our proposal, based on a suite of case study applications, run on top of an off-the-shelf Linux machine equipped with 32 CPU-cores and 64 GB of RAM, is also presented.},
doi = {10.1007/s10766-016-0429-2},
publisher = {Springer Verlag},
series = {IJPP},
}
@Article{DiS15,
author = {Di Sanzo, Pierangelo and Quaglia, Francesco and Ciciani, Bruno and Pellegrini, Alessandro and Didona, Diego and Romano, Paolo and Palmieri, Roberto and Peluso, Sebastiano},
title = {A Flexible Framework for Accurate Simulation of Cloud In-Memory Data Stores},
journal = {Simulation Modelling Practice and Theory},
year = {2015},
issn = {1569-190X},
month = jul,
number = {2},
pages = {219--238},
volume = {58},
abstract = {In-memory (transactional) data stores, also referred to as data grids, are recognized as a first-class data management technology for cloud platforms, thanks to their ability to match the elasticity requirements imposed by the pay-as-you-go cost model. On the other hand, determining how performance and reliability/availability of these systems vary as a function of configuration parameters, such as the amount of cache servers to be deployed, and the degree of in-memory replication of slices of data, is far from being a trivial task. Yet, it is an essential aspect of the provisioning process of cloud platforms, given that it has an impact on the amount of cloud resources that are planned for usage. To cope with the issue of predicting/analysing the behavior of different configurations of cloud in-memory data stores, in this article we present a flexible simulation framework offering skeleton simulation models that can be easily specialized in order to capture the dynamics of diverse data grid systems, such as those related to the specific (distributed) protocol used to provide data consistency and/or transactional guarantees. Besides its flexibility, another peculiar aspect of the framework lies in that it integrates simulation and machine-learning (black-box) techniques, the latter being used to capture the dynamics of the data-exchange layer (e.g. the message passing layer) across the cache servers. This is a relevant aspect when considering that the actual data-transport/networking infrastructure on top of which the data grid is deployed might be unknown, hence being not feasible to be modeled via white-box (namely purely simulative) approaches. We also provide an extended experimental study aimed at validating instances of simulation models supported by our framework against execution dynamics of real data grid systems deployed on top of either private or public cloud infrastructures. 
Particularly, our validation test-bed has been based on an industrial-grade open-source data grid, namely Infinispan by JBoss/Red-Hat, and a de-facto standard benchmark for NoSQL platforms, namely YCSB by Yahoo. The validation study has been conducted by relying on both public and private cloud systems, scaling the underlying infrastructure up to 100 (resp. 140) Virtual Machines for the public (resp. private) cloud case. Further, we provide some experimental data related to a scenario where our framework is used for on-line capacity planning and reconfiguration of the data grid system.},
doi = {10.1016/j.simpat.2015.05.011},
publisher = {Elsevier},
series = {SIMPAT},
}
@Article{Pell14d,
author = {Pellegrini, Alessandro and Vitali, Roberto and Quaglia, Francesco},
title = {Autonomic State Management for Optimistic Simulation Platforms},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = {2015},
issn = {1045-9219},
month = jun,
number = {6},
pages = {1560--1569},
volume = {26},
abstract = {We present the design and implementation of an autonomic state manager (ASM) tailored for integration within optimistic parallel discrete event simulation (PDES) environments based on the C programming language and the executable and linkable format (ELF), and developed for execution on x86_64 architectures. With ASM, the state of any logical process (LP), namely the individual (concurrent) simulation unit being part of the simulation model, is allowed to be scattered on dynamically allocated memory chunks managed via standard API (e.g., malloc/free). Also, the application programmer is not required to provide any serialization/ deserialization module in order to take a checkpoint of the LP state, or to restore it in case a causality error occurs during the optimistic run, or to provide indications on which portions of the state are updated by event processing, so to allow incremental checkpointing. All these tasks are handled by ASM in a fully transparent manner via (A) runtime identification (with chunk-level granularity) of the memory map associated with the LP state, and (B) runtime tracking of the memory updates occurring within chunks belonging to the dynamic memory map. The co-existence of the incremental and non-incremental log/restore modes is achieved via dual versions of the same application code, transparently generated by ASM via compile/link time facilities. Also, the dynamic selection of the best suited log/ restore mode is actuated by ASM on the basis of an innovative modeling/optimization approach which takes into account stability of each operating mode with respect to variations of the model/environmental execution parameters.},
doi = {10.1109/TPDS.2014.2323967},
publisher = {IEEE Computer Society},
series = {TPDS},
}
@Article{Vit12e,
author = {Vitali, Roberto and Pellegrini, Alessandro and Quaglia, Francesco},
title = {Load Sharing for Optimistic Parallel Simulations on Multi Core Machines},
journal = {ACM SIGMETRICS Performance Evaluation Review},
year = {2012},
issn = {0163-5999},
month = aug,
number = {3},
pages = {2--11},
volume = {40},
abstract = {Parallel Discrete Event Simulation (PDES) is based on the partitioning of the simulation model into distinct Logical Processes (LPs), each one modeling a portion of the entire system, which are allowed to execute simulation events concurrently. This allows exploiting parallel computing architectures to speedup model execution, and to make very large models tractable. In this article we cope with the optimistic approach to PDES, where LPs are allowed to concurrently process their events in a speculative fashion, and rollback/ recovery techniques are used to guarantee state consistency in case of causality violations along the speculative execution path. Particularly, we present an innovative load sharing approach targeted at optimizing resource usage for fruitful simulation work when running an optimistic PDES environment on top of multi-processor/multi-core machines. Beyond providing the load sharing model, we also define a load sharing oriented architectural scheme, based on a symmetric multi-threaded organization of the simulation platform. Finally, we present a real implementation of the load sharing architecture within the open source ROme OpTimistic Simulator (ROOT-Sim) package. Experimental data for an assessment of both viability and effectiveness of our proposal are presented as well.},
doi = {10.1145/2425248.2425250},
issue_date = {December 2012},
numpages = {10},
publisher = {ACM},
series = {PER},
}
%%%%% Conference Proceedings %%%%%
@InProceedings{Mar24b,
author = {Marotta, Romolo and Pellegrini, Alessandro},
booktitle = {Proceedings of the 2024 Winter Simulation Conference},
title = {Model-Driven Engineering for High-Performance Parallel Discrete Event Simulations on Heterogeneous Architectures},
year = {2024},
month = dec,
publisher = {IEEE},
series = {WSC},
abstract = {Modern high-performance, large-scale simulations require significant computational power, memory, and storage, making heterogeneous architectures an attractive option. Yet, the presence of accelerators in heterogeneous architectures makes model development hard. Domain-specific languages (DSLs) have successfully simplified model development, but designing a DSL to target heterogeneous architectures can be burdensome. Model-driven engineering (MDE) can simplify the development of DSLs targeting heterogeneous architectures. In this paper, we present a model-driven approach targeting Parallel Discrete Event Simulations on heterogeneous architectures. We exercise our MDE-generated models using a state-of-the-art runtime environment for heterogeneous architectures, using a custom DSL as an example.},
}
@inproceedings{Qua24,
author = {Quaglia, Francesco},
title = {PARSIR: a Package for Effective Parallel Discrete Event Simulation on Multi-processor Machines},
booktitle = {Proceedings of the 28th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
year = {2024},
month = oct,
publisher = {IEEE},
series = {DS-RT},
location = {Urbino, Italy},
note = {To appear}
}
@InProceedings{And24,
author = {Andelfinger, Philipp and Pellegrini, Alessandro and Marotta, Romolo},
title = {Sampling Policies for Near-Optimal Device Choice in Parallel Simulations on CPU/GPU Platforms},
booktitle = {Proceedings of the 28th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
year = {2024},
month = oct,
publisher = {IEEE},
series = {DS-RT},
location = {Urbino, Italy},
}
@InProceedings{Du24,
author = {Du, Xiaorui and Piccione, Andrea and Pimpini, Adriano and Bortoli, Stefano and Pellegrini, Alessandro and Knoll, Alois},
title = {Online Analytics with Local Operator Rebinding for Simulation Data Stream Processing},
booktitle = {Proceedings of the 28th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
year = {2024},
month = oct,
publisher = {IEEE},
series = {DS-RT},
location = {Urbino, Italy},
}
@InProceedings{Pic24,
author = {Piccione, Andrea and Pellegrini, Alessandro},
booktitle = {Proceedings of the 2024 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
title = {Efficient Non-Blocking Event Management for Speculative Parallel~Discrete~Event~Simulation},
year = {2024},
month = jun,
publisher = {ACM},
series = {SIGSIM-PADS '24},
abstract = {Parallel Discrete Event Simulation (PDES) is a modelling technique that takes advantage of concurrent computing resources. However, its asynchronous nature can present challenges for efficient execution. This paper proposes a new non-blocking management system for handling messages and anti-messages in Time Warp simulations. This approach exploits the benefits of non-blocking algorithms to surpass the limitations of existing blocking mechanisms, resulting in more efficient and scalable simulations. Specifically, the approach relies on efficient atomic fetch-and-add operations provided by modern computer architectures for evaluating and updating the status of the event.},
doi = {10.1145/3615979.3656053},
location = {Atlanta, GA, USA},
badges = {available,functional,reproduced},
}
@InProceedings{Mar24,
author = {Marotta, Romolo and Pellegrini, Alessandro and Andelfinger, Philipp},
booktitle = {Proceedings of the 2024 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
title = {Follow the Leader: Alternating CPU/GPU Computations in PDES},
year = {2024},
month = jun,
publisher = {ACM},
series = {SIGSIM-PADS '24},
abstract = {Despite the successes of graphics processing units (GPUs) in accelerating simulations in several research fields, their use is largely restricted to domain-specific workloads that consistently offer the large degree of inherent parallelism and computational intensity at which GPUs excel.
When targeting generic discrete-event simulations, whose dynamics can vary wildly over time, a static choice between a GPU-based and traditional CPU-based execution is likely to be suboptimal.
Here, we explore a parallel discrete-event (PDES) execution scheme for CPU-GPU platforms that aims to approximate an optimal dynamic device choice.
Starting from an intermediate model state, a current "leader" device running the simulation is periodically challenged by a brief concurrent run on another device starting from an intermediate model state.
Based on the gathered performance measurements, a forecasting scheme determines the leader for the next period.
The execution time and power consumption of this scheme hinge on 1) an efficient mechanism for providing the "follower" device with a consistent model state, and 2) robust performance forecasting to justify the device choices.
We present these building blocks, their implementation combining the existing CPU and GPU simulators ROOT-Sim and GPUTW, and measurement results demonstrating substantially reduced execution time without increasing energy consumption over a static device choice.},
doi = {10.1145/3615979.3656056},
location = {Atlanta, GA, USA},
badges = {available,functional,reproduced},
}
@InProceedings{DuX24,
author = {Du, Xiaorui and Piccione, Andrea and Pimpini, Adriano and Bortoli, Stefano and Knoll, Alois and Pellegrini, Alessandro},
booktitle = {Proceedings of the 24th International Symposium on Cluster, Cloud and Grid Computing},
title = {HUILLY: A Non-Blocking Ingestion Buffer for Timestepped Simulation Analytics},
year = {2024},
month = may,
publisher = {IEEE},
series = {CCGrid},
}
@InProceedings{DuX23,
author = {Du, Xiaorui and Pimpini, Adriano and Piccione, Andrea and Meng, Zhuoxiao and Siguenza-Torres, Anibal and Bortoli, Stefano and Knoll, Alois and Pellegrini, Alessandro},
booktitle = {Proceedings of the 2023 Winter Simulation Conference},
title = {Autonomic Orchestration of In-situ and In-transit Data Analytics for Simulation Studies},
year = {2023},
month = dec,
publisher = {IEEE},
series = {WSC},
abstract = {Modern parallel/distributed simulations can produce large amounts of data. The historical approach of performing analyses at the end of the simulation is unlikely to cope with modern, extremely large-scale analytics jobs. Indeed, the I/O subsystem can quickly become the global bottleneck. Similarly, processing on-the-fly the data produced by simulations can significantly impair the performance in terms of computational capacity and network load.
We present a methodology and reference architecture for constructing an autonomic control system to determine at runtime the best placement for data processing (on simulation nodes or a set of external nodes). This allows for a good tradeoff between the load on the simulation's critical path and the data communication system. Our preliminary experimentation shows that autonomic orchestration is crucial to improve the global performance of a data analysis system, especially when the simulation node's rate of data production varies during simulation.},
}
@inproceedings{Pic23c,
  author    = {Piccione, Andrea and Pellegrini, Alessandro},
  title     = {Practical Tie Breaking for Parallel/Distributed Simulations},
  booktitle = {Proceedings of the 27th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
  series    = {DS-RT},
  year      = {2023},
  month     = oct,
  publisher = {IEEE},
  location  = {Singapore},
  note      = {Winner of the Best Paper Award},
  abstract  = {In this paper, we discuss a tie-breaking strategy based on a bitwise comparison of event payload that allows parallel and distributed discrete-event simulations to observe a deterministic order in the execution of events, even in the presence of event ties. This approach provides practical usability whenever model-assisted tie-breaking is unavailable, thus ensuring that multiple simulation executions provide deterministic behaviour and repeatable results. Moreover, it ensures that the selected order of events is also consistent with sequential executions. We discuss the theory behind this strategy and experimentally show that the performance drop is imputable to event queue management when relying on tie-breaking strategies like the ones discussed in this work.},
}
@inproceedings{Mar23,
  author    = {Marotta, Romolo and Montesano, Federica and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {Incremental Checkpointing of Large State Simulation Models with Write-Intensive Events via Memory Update Correlation on Buddy Pages},
  booktitle = {Proceedings of the 27th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
  series    = {DS-RT},
  year      = {2023},
  month     = oct,
  publisher = {IEEE},
  location  = {Singapore},
  note      = {Shortlisted for the Best Paper Award},
  abstract  = {Checkpointing techniques for speculative parallel simulation of discrete event models have been widely studied in the literature. However, there has been a very marginal attempt to exploit operating system page-protection services, which have instead been largely exploited in the context of checkpointing for fault tolerance. In this article, we discuss how these services can effectively manage simulation models with large states and write-intensive events on zones in the state layout. In particular, we present a solution where the correlation of write operations on buddy pages in the state layout can be exploited for achieving effective incremental checkpointing support, which allows scaling down the costs of operating system services. Our solution does not require any instrumentation of the simulation application code and is usable on any Posix-compliant operating system. We also discuss its integration within the USE (Ultimate-Share-Everything) open-source speculative simulation package and report some experimental data for its assessment.},
}
@inproceedings{Pic23b,
  author    = {Piccione, Andrea and Andelfinger, Philipp and Pellegrini, Alessandro},
  title     = {Hybrid Speculative Synchronisation for Parallel Discrete Event Simulation},
  booktitle = {Proceedings of the 2023 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
  series    = {SIGSIM-PADS '23},
  year      = {2023},
  month     = jun,
  publisher = {ACM},
  doi       = {10.1145/3573900.3591124},
  location  = {Orlando, FL, USA},
  badges    = {available,reusable,reproduced},
  abstract  = {Parallel discrete-event simulation (PDES) is a well-established family of methods to accelerate discrete-event simulations. However, the available algorithms vary substantially in the performance achievable for different simulation models, largely preventing generic solutions applicable by modellers without expert knowledge. For instance, in Time Warp, the processing elements execute events asynchronously and speculatively with high aggressiveness, leading to frequent and costly rollbacks if misspeculations occur often. In contrast, synchronous approaches such as the new Window Racer algorithm exhibit a more cautious form of speculation. In the present paper, we combine these two fundamentally different algorithms within a single runtime environment, allowing for a choice of the best algorithm for different model segments. We describe the architecture and the algorithmic considerations to support the efficient coexistence and interaction of the algorithms without violating the correctness of the simulation. Our experiments using a synthetic benchmark and an epidemics model show that the hybrid algorithm is less sensitive to its configuration and can deliver substantially higher performance in models with varying degrees of coupling among entities compared to each algorithm on its own.},
}
@InProceedings{Pic23,
author = {Piccione, Andrea and Bernardinetti, Giorgio and Pellegrini, Alessandro and Bianchi, Giuseppe},
title = {Is Your Smartphone Really Safe? A Wake-up Call on {Android} Antivirus Software Effectiveness},
booktitle = {Proceedings of the 2023 Italian Conference on Cybersecurity},
abstract = {A decade ago, researchers raised severe concerns about Android smartphones' security by extensively assessing and recognising the limitations of Android antivirus software. Considering the significant increase in the economic role of smartphones in recent years, we would expect that security measures are significantly improved by now. To test this assumption, we conducted a relatively extensive study to evaluate the effectiveness of off-the-shelf antivirus software in detecting malicious applications injected into legitimate Android applications.
We specifically repackaged seven widely used Android applications with 100 obfuscated malware instances. We submitted the 700 samples to the VirusTotal web portal, testing the effectiveness of the over 70 free and commercial antiviruses available in detecting them.
For the obfuscation part, we intentionally employed publicly available tools that could be used by ``just'' a tech-savvy adversary. We used a combination of well-known and novel (but still simple) obfuscation techniques. Surprisingly (or perhaps unsurprisingly?), our findings indicate that almost 76\% of the samples went utterly undetected. Even when our samples were detected, this occurred for a handful (never more than 4) of Android antivirus software available on VirusTotal. This lack of awareness of the effectiveness of Android antivirus is critical because the false sense of security given by antivirus software could prompt users to install applications from untrusted sources, allowing attackers to install a persistent threat within another application easily.},
year = {2023},
month = may,
publisher = {CEUR-WS.org},
series = {ITASEC},
location = {Bari, Italy},
}
@inproceedings{And22,
  author    = {Andelfinger, Philipp and Piccione, Andrea and Pellegrini, Alessandro and Uhrmacher, Adelinde},
  title     = {Comparing Speculative Synchronization Algorithms for Continuous-Time Agent-Based Simulations},
  booktitle = {Proceedings of the 26th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
  series    = {DS-RT},
  year      = {2022},
  month     = sep,
  publisher = {IEEE},
  location  = {Alès, France},
  note      = {Winner of the Best Paper Award},
  abstract  = {Continuous-Time agent-based models often represent tightly-coupled systems where agents' state transitions occur in close interaction with neighbouring agents. Without artificial discretization, the potential for near-instantaneous propagation of effects across the model challenges their parallel execution. Although existing algorithms can tackle the largely unpredictable nature of such simulations through speculative execution, they are subject to trade-offs concerning the degree of optimism, the probability and cost of rollbacks, and locality exploitation. This paper aims to understand the suitability of asynchronous and synchronous parallel simulation algorithms when executing continuous-time agent-based models with rate-driven stochastic transitions. We present extensive measurement results comparing optimized implementations under various configurations of a parametrizable simulation model of the epidemic spread of disease. Our results show that the amount of locality in the agent interactions is the decisive factor for the relative performance of the approaches. We identify remaining hurdles for higher simulation performance with the two classes of algorithms and outline potential refinements based on profiling results.},
}
@inproceedings{Pim22b,
  author    = {Pimpini, Adriano and Piccione, Andrea and Pellegrini, Alessandro},
  title     = {On the Accuracy and Performance of Spiking Neural Network Simulations},
  booktitle = {Proceedings of the 26th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
  series    = {DS-RT},
  year      = {2022},
  month     = sep,
  publisher = {IEEE},
  location  = {Alès, France},
  note      = {Shortlisted for the Best Paper Award},
  abstract  = {Spiking Neural Networks (SNNs) are a class of Artificial Neural Networks that show a time behaviour that cannot be computed with single one-shot functions. Therefore, to study their evolution over time, simulations are typically employed. Typical simulation approaches rely on time stepped simulations, while more recent works have highlighted the opportunity to rely on Parallel Discrete Event Simulation (PDES) for improved accuracy. In particular, Speculative PDES has been shown to be a suitable simulation paradigm to deal with the peculiar temporal domain of SNNs. In this paper, we perform an experimental evaluation of these two different approaches, showing the implications on both simulation performance and accuracy. Our assessment showcases that Parallel Discrete Event Simulation can deliver good scaling on parallel architectures while offering more accurate results.},
}
@inproceedings{Pim22,
  author    = {Pimpini, Adriano and Piccione, Andrea and Ciciani, Bruno and Pellegrini, Alessandro},
  title     = {Speculative Distributed Simulation of Very Large Spiking Neural Networks},
  booktitle = {Proceedings of the 2022 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
  series    = {PADS},
  year      = {2022},
  month     = jun,
  publisher = {ACM},
  doi       = {10.1145/3518997.3531027},
  location  = {Atlanta, GA, USA},
  badges    = {available,reusable},
  abstract  = {Spiking Neural Networks are a class of Artificial Neural Networks that closely mimic biological neural networks. They are particularly interesting because of their potential to advance research in several fields, both because of better insights on neural behaviour (benefiting medicine, neuroscience, psychology) and the potential in Artificial Intelligence. Their ability to run on a low energy budget once implemented in hardware makes them even more appealing. However, because of their behaviour that evolves with time, when a hardware implementation is not available, their output cannot simply be computed with a one-shot function (however complex), but instead they need to be simulated.
Simulating Spiking Neural Networks is exceptionally costly, mainly due to their sheer size. Many current simulation methods have trouble scaling up on more powerful systems because of conservative synchronisation methods. Scalability is often offered through approximation of the actual results. In this paper, we present a modelling methodology and runtime-environment support adhering to the Time Warp synchronisation protocol, which enables speculative distributed simulation of Spiking Neural Network models with improved accuracy of the results. We discuss the methodological and technical aspects that will allow effective speculative simulation and present an experimental assessment on large virtualised environments, which shows the viability of simulating networks made of millions of neurons.},
}
@inproceedings{DeA21b,
  author    = {De~Angelis, Emanuele and Pellegrini, Alessandro and Proietti, Maurizio},
  title     = {Automatic Extraction of Behavioral Features for Test Program Similarity Analysis},
  booktitle = {Proceedings of the 2021 IEEE International Symposium on Software Reliability Engineering Workshops},
  series    = {ISSREW},
  year      = {2021},
  month     = oct,
  pages     = {129--136},
  publisher = {IEEE},
  doi       = {10.1109/ISSREW53611.2021.00054},
  location  = {Wuhan, China},
  abstract  = {We present a methodology for performing automatic extraction of behavioral features from test programs, that is, for collecting pieces of information about the test programs execution. These features are then exploited to carry out analysis and reasoning about test program similarity. The similarity information can be used to drive the execution of test campaigns, in the attempt to either reduce the time-to-test, or to increase the testing capabilities of a given test suite. Our methodology is embedded in the Hyperion analysis framework, which can be configured to define a wide range of test program similarity criteria.},
}
@InProceedings{DeA21,
author = {De~Angelis, Emanuele and De~Angelis, Guglielmo and Pellegrini, Alessandro and Proietti, Maurizio},
booktitle = {Proceedings of the 15th IEEE International Conference on Service Oriented Systems Engineering},
title = {Inferring Relations Among Test Programs in Microservices Applications},
year = {2021},
month = aug,
publisher = {IEEE},
pages = {114--123},
series = {SOSE},
abstract = {The emergence of the microservices-oriented architectural style calls for novel methodologies and technological frameworks that support the design, development, and maintenance of applications structured according to that new style. In this paper, we consider the issue of designing suitable strategies for the governance and the automation of testing activities within the microservices paradigm.
We focus on the problem of discovering relations between test programs that help avoiding to re-run all the available test suites each time one of its constituents evolves.
We propose an analysis technique, based on symbolic execution of test programs, which is able to collect information about the invocations of local and remote APIs performed when running such programs.
Symbolic execution enables the analysis of sets of executions corresponding to different input data, and hence it is also suitable for parametric test programs.
The information extracted by symbolic execution is processed by a rule-based automated reasoning engine, which infers dependencies and similarities among test programs. In particular, test programs are considered similar if they involve the same microservice instance, or they connect to the same remote API, or they locally activate overlapping APIs, or they raise similar kinds of errors.
We show the viability of our approach by presenting a case study within the context of a real-world microservice application that implements an open-source educational platform.},
location = {Oxford, UK},
doi = {10.1109/SOSE52839.2021.00018},
note = {Winner of the Best Paper Award},
}
@InProceedings{Rab20,
author = {Rab, Maryan and Marotta, Romolo and Ianni, Mauro and Pellegrini, Alessandro and Quaglia, Francesco},
booktitle = {Proceedings of the 24th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
title = {{NUMA}-Aware Non-Blocking Calendar Queue},
year = {2020},
month = sep,
publisher = {IEEE},
series = {DS-RT},
abstract = {Modern computing platforms are based on multi-processor/multi-core technology. This allows running applications with a high degree of hardware parallelism. However, medium-to-high end machines pose a problem related to the asymmetric delays threads experience when accessing shared data. Specifically, Non-Uniform-Memory-Access (NUMA) is the dominating technology---thanks to its capability for scaled-up memory bandwidth---which however imposes asymmetric distances between CPU-cores and memory banks, making an access by a thread to data placed on a far NUMA node severely impacting performance. In this article, we tackle this problem in the context of shared event-pool management, a relevant aspect in many fields, like parallel discrete event simulation. Specifically, we present a NUMA-aware calendar queue, which also has the advantage of making concurrent threads coordinate via a non-blocking scalable approach. Our proposal is based on work deferring combined with dynamic re-binding of the calendar queue operations (insertions/extractions) to the best suited among the concurrent threads hosted by the underlying computing platform. This changes the locality of the operations by threads in a way positively reflected onto NUMA tasks at the hardware level. We report the results of an experimental study, demonstrating the capability of our solution to achieve the order of 15\% better performance compared to state-of-the-art solutions already suited for multi-core environments.},
location = {Prague, Czech Republic},
}
@inproceedings{Pic20,
  author    = {Piccione, Andrea and Pellegrini, Alessandro},
  title     = {Agent-based Modeling and Simulation for Emergency Scenarios: A Holistic Approach},
  booktitle = {Proceedings of the 24th IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
  series    = {DS-RT},
  year      = {2020},
  month     = sep,
  publisher = {IEEE},
  location  = {Prague, Czech Republic},
  abstract  = {Agent-based Modeling and Simulation is a powerful technique which allows to study the interactions in complex systems, and allows to explore or even foresee the emergence of more complicated properties or behaviors related to the interaction among the simpler agents in the environment. In the context of emergency or crisis scenarios, Agent-based Modeling and Simulation can allow to effectively study emergency plans, with the goal of assessing their viability, also with respect to the number of possible fatalities. In this paper, we analyze Agent-based Modeling and Simulation for crisis scenarios from a methodological and empirical point of view, with the goal of identifying what are the behavioral parameters that a model should encompass, in order for the results of the simulation to be useful for emergency plan assessment and/or compilation. We also experimentally provide a characterization of the effects of such behavioral parameters.},
}
@InProceedings{Con20,
author = {Conoci, Stefano and Ianni, Mauro and Marotta, Romolo and Pellegrini, Alessandro},
booktitle = {Proceedings of the 2020 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
title = {Autonomic Power Management in Speculative Simulation Runtime Environments},
year = {2020},
month = jun,
publisher = {ACM},
series = {PADS},
abstract = {While transitioning to exascale systems, it has become clear that power management plays a fundamental role to support a viable utilization of the underlying hardware, also performance-wise. To meet power restrictions imposed by future exascale supercomputers, runtime environments will be required to enforce self-tuning schemes to run dynamic workloads under an imposed power cap. Literature results show that, for a wide class of multi-threaded applications, tuning both the degree of parallelism and frequency/voltage of cores allows a more effective use of the budget, compared to techniques that use only one of these mechanisms in isolation.
In this paper, we explore the issues associated with applying these techniques on speculative Time-Warp based simulation runtime environments. We discuss how the differences in two antithetical Time Warp-based simulation environments impact the obtained results. Our assessment confirms that the performance gains achieved through a proper allocation of the power budget can be significant. We also identify the research challenges that would make these forms of self-tuning more broadly applicable.},
doi = {10.1145/3384441.3395980},
location = {Miami, FL, USA},
}
@inproceedings{Pri20b,
  author    = {Principe, Matteo and Piccione, Andrea and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {Approximated Rollbacks},
  booktitle = {Proceedings of the 2020 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
  series    = {PADS},
  year      = {2020},
  month     = jun,
  publisher = {ACM},
  doi       = {10.1145/3384441.3395984},
  location  = {Miami, FL, USA},
  badges    = {available,reusable,reproduced},
  abstract  = {A rollback operation in a speculative parallel discrete event simulator has traditionally targeted the perfect reconstruction of the state to be restored after a timestamp-order violation. This imposes that the rollback support entails specific capabilities and consequently pays given costs. In this article we propose approximated rollbacks, which allow a simulation object to perfectly realign its virtual time to the timestamp of the state to be restored, but lead the reconstructed state to be an approximation of what it should really be. The advantage is an important reduction of the cost for managing the state restore task in a rollback phase, as well as for managing the activities (i.e. state saving) that actually enable rollbacks to be executed. Our proposal is suited for stochastic simulations, and explores a tradeoff between the statistical representativeness of the outcome of the simulation run and the execution performance. We provide mechanisms that enable the application programmer to control this tradeoff, as well as simulation-platform level mechanisms that constitute the basis for managing approximate rollbacks in general simulation scenarios. A study on the aforementioned tradeoff is also presented.},
}
@InProceedings{Sil20,
author = {Silvestri, Emiliano and Milia, Cristian and Marotta, Romolo and Pellegrini, Alessandro and Quaglia, Francesco},
booktitle = {Proceedings of the 2020 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
title = {Exploiting Inter-Processor-Interrupts for Virtual-Time Coordination in Speculative Parallel Discrete Event Simulation},
year = {2020},
month = jun,
publisher = {ACM},
series = {PADS},
abstract = {Reducing the waste of resource usage (e.g., CPU-cycles) when a causality error occurs in speculative parallel discrete event simulation (PDES) is still a core objective. In this article,
we target this objective in the context of speculative PDES run on top of shared-memory machines. We propose an Operating System
approach that is based on the exploitation of the Inter-Processor-Interrupt (IPI) facility offered by off-the-shelf hardware chip sets, which enables cross-CPU-core control of the execution flow of threads. As soon as a thread $T$ produces a new event placed in the past virtual time of a simulation object currently run by another thread $T'$, our IPI-based support allows $T$ to change the execution flow of $T'$---with very minimal delay---so to enable the early squash of the currently processed (and no longer consistent) event. Our solution is fully transparent to the application level code, and is coupled with a lightweight heuristic-based mechanism that determines the actual goodness of killing thread $T'$ via the IPI (rather than skipping the IPI send) depending on the expected residual execution time of the incorrect event being processed. We integrated our proposal within the speculative open-source USE (Ultimate Share Everything) PDES package, and we report experimental results obtained by running various PDES models on top of two shared-memory hardware architectures equipped with 32 and 24 (48 Hyper-threads) CPU-cores, which demonstrate the effectiveness of our proposal.},
doi = {10.1145/3384441.3395985},
location = {Miami, FL, USA},
}
@InProceedings{Alt20,
author = {Altamura, Lorenzo and Conoci, Stefano and Pellegrini, Alessandro},
booktitle = {15th International Conference on High Performance and Embedded Architecture and Compilation Workshops},
title = {Asymmetric Computation for Speculative Heterogeneous {HPC}},
year = {2020},
month = jan,
series = {HiPEAC},
abstract = {HPC applications on future exascale systems will demand for runtime environments able to transparently manage the complexity of the underlying heterogeneous hardware. In this abstract, we discuss a computation model for speculative HPC applications, able to deliver non-minimal performance increase and significant energy savings. This model can be easily adapted to multiple heterogeneous hardware families with minor effort, and it can autonomically and promptly reassign units of work to different hardware classes. Our design jointly targets performance and energy efficiency. We also provide a preliminary experimental evaluation of our design.},
location = {Bologna, Italy},
}
@InProceedings{Car19,
author = {Carnà, Stefano and Ferracci, Serena and De Santis, Emanuele and Pellegrini, Alessandro and Quaglia, Francesco},
booktitle = {Proceedings of the 2019 Winter Simulation Conference},
title = {Hardware-assisted Incremental Checkpointing in Speculative Parallel Discrete Event Simulation},
year = {2019},
month = dec,
publisher = {IEEE},
series = {WSC},
abstract = {Nowadays hardware platforms offer a plethora of innovative facilities for profiling the execution of programs. Most of them have been exploited as tools for program characterization, thus being used as kind of program-external observers. In this article we take the opposite perspective where hardware profiling facilities are exploited to execute core functional tasks for the correct and efficient execution of speculative Parallel Discrete Event Simulation (PDES) applications. In more detail we exploit them—specifically, the ones offered by Intel x86-64 processors—to build a hardware-supported incremental checkpointing solution that enables the reduction of the event-execution cost in speculative PDES compared to the software-based counterpart. We integrated our solution in the open source ROOT-Sim runtime environment, thus making it available for exploitation.},
location = {Washington, DC, USA},
}
@InProceedings{Pic19,
author = {Piccione, Andrea and Principe, Matteo and Pellegrini, Alessandro and Quaglia, Francesco},
booktitle = {Proceedings of the 2019 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
title = {An Agent-Based Simulation {API} for Speculative {PDES} Runtime Environments},
year = {2019},
month = jun,
pages = {83--94},
publisher = {ACM},
series = {PADS},
abstract = {Agent-Based Modeling and Simulation (ABMS) is an effective paradigm to model systems exhibiting complex interactions, also with the goal of studying the emergent behavior of these systems. While ABMS has been effectively used in many disciplines, many successful models are still run only sequentially. Relying on simple and easy-to-use languages such as NetLogo limits the possibility to benefit from more effective runtime paradigms, such as speculative Parallel Discrete Event Simulation (PDES). In this paper, we discuss a semantically-rich API allowing to implement Agent-Based Models in a simple and effective way. We also describe the critical points which should be taken into account to implement this API in a speculative PDES environment, to scale up simulations on distributed massively-parallel clusters. We present an experimental assessment showing how our proposal allows to implement complicated interactions with a reduced complexity, while delivering a non-negligible performance increase.},
doi = {10.1145/3316480.3322890},
location = {Chicago, IL, USA},
badges = {available,reusable},
}
@InProceedings{Mar19,
author = {Marotta, Romolo and Ianni, Mauro and Scarselli, Andrea and Pellegrini, Alessandro and Quaglia, Francesco},
booktitle = {Proceedings of the 19th International Symposium on Cluster, Cloud and Grid Computing},
title = {{NBBS}: A Non-blocking Buddy System for Multi-core Machines},
year = {2019},
month = may,
pages = {11--20},
publisher = {IEEE Computer Society},
series = {CCGrid},
abstract = {Common implementations of core memory allocation components, like the Linux buddy system, handle concurrent allocation/release requests by synchronizing threads via spinlocks. This approach is not prone to scale, a problem that has been addressed in the literature by introducing layered allocation services or replicating the core allocators—the bottom most ones within the layered architecture. Both these solutions tend to reduce the pressure of actual concurrent accesses to each individual core allocator. In this article we explore an alternative approach to scalability of memory allocation/release, which can be still combined with those literature proposals. We present a fully non-blocking buddy-system, where threads performing concurrent allocations/releases do not undergo any spin-lock based synchronization. Our solution allows threads to proceed in parallel, and commit their allocations/releases unless a conflict is materialized while handling the allocator metadata. Conflict detection relies on atomic Read-Modify-Write (RMW) machine instructions. Beyond improving scalability and performance, our solution can also avoid wasting clock cycles for spin-lock operations by threads that could in principle carry out their memory allocations/releases in full concurrency.},
doi = {10.1109/CCGRID.2019.00011},
location = {Larnaca, Cyprus},
}
@InProceedings{Eco18,
author = {Economo, Simone and Silvestri, Emiliano and Di Sanzo, Pierangelo and Pellegrini, Alessandro and Quaglia, Francesco},
booktitle = {Proceedings of the 24th International Conference on Parallel and Distributed Systems},
title = {Model-based Proactive Read-validation in Transaction Processing Systems},
year = {2018},
month = dec,
pages = {481--488},
publisher = {IEEE Computer Society},
series = {ICPADS},
abstract = {Concurrency control protocols based on read-validation schemes allow transactions which are doomed to abort to still run until a subsequent validation check reveals them as invalid. These late aborts do not favor the reduction of wasted computation and can penalize performance. To counteract this problem, we present an analytical model that predicts the abort probability of transactions handled via read-validation schemes. Our goal is to determine what are the suited points—along a transaction lifetime—to carry out a validation check. This may lead to early aborting doomed transactions, thus saving CPU time. We show how to exploit the abort probability predictions returned by the model in combination with a threshold-based scheme to trigger read-validations. We also show how this approach can definitely improve performance—leading up to 14\% better turnaround—as demonstrated by some experiments carried out with a port of the TPC-C benchmark to Software Transactional Memory.},
doi = {10.1109/PADSW.2018.8644605},
location = {Singapore},
}
@inproceedings{Ian18b,
  author    = {Ianni, Mauro and Marotta, Romolo and Cingolani, Davide and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {Optimizing Simulation on Shared-Memory Platforms: the Smart Cities Case},
  booktitle = {Proceedings of the 2018 Winter Simulation Conference},
  series    = {WSC},
  year      = {2018},
  month     = dec,
  pages     = {1969--1980},
  publisher = {IEEE Computer Society},
  doi       = {10.1109/WSC.2018.8632301},
  location  = {Gothenburg, Sweden},
  abstract  = {Modern advancements in computing architectures have been accompanied by new emergent paradigms to run Parallel Discrete Event Simulation models efficiently. Indeed, many new paradigms to effectively use the available underlying hardware have been proposed in the literature. Among these, the Share-Everything paradigm tackles massively-parallel shared-memory machines, in order to support speculative simulation by taking into account the limits and benefits related to this family of architectures. Previous results have shown how this paradigm outperforms traditional speculative strategies (such as data-separated Time Warp systems) whenever the granularity of executed events is small. In this paper, we show performance implications of this simulation-engine organization when the simulation models have a variable granularity. To this end, we have selected a traffic model, tailored for smart cities-oriented simulation. Our assessment illustrates the effects of the various tuning parameters related to the approach, opening to a higher understanding of this innovative paradigm.},
}
@InProceedings{Mar18,
author = {Marotta, Romolo and Ianni, Mauro and Scarselli, Andrea and Pellegrini, Alessandro and Quaglia, Francesco},
booktitle = {{IEEE} International Conference on Cluster Computing},
title = {A Non-blocking Buddy System for Scalable Memory Allocation on Multi-core Machines},
year = {2018},
month = sep,
pages = {164--165},
publisher = {IEEE Computer Society},
series = {CLUSTER},
abstract = {In this short paper we tackle the issue of scalability of core memory allocators, which is an orthogonal optimization with respect to reducing the pressure to core allocators by (a), (b), or (c). In particular, our contribution is the design of a non-blocking (lock-free) allocator implementing the buddy-system specification, where concurrent allocations/deallocations are not coordinated via spin-locks, but by only relying on individual Read-Modify-Write (RMW) instructions executed along the critical path of allocation/deallocation operations. These instructions are exploited to detect whether concurrent requests have conflicted on the same portion of the allocator metadata},
doi = {10.1109/CLUSTER.2018.00034},
location = {Belfast, UK},
}
@inproceedings{Ian18,
  author    = {Ianni, Mauro and Marotta, Romolo and Cingolani, Davide and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {The Ultimate Share-Everything PDES System},
  booktitle = {Proceedings of the 2018 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
  series    = {PADS},
  pages     = {73--84},
  publisher = {ACM},
  month     = may,
  year      = {2018},
  doi       = {10.1145/3200921.3200931},
  location  = {Rome, Italy},
  badges    = {available,reusable,reproduced},
  abstract  = {The share-everything PDES (Parallel Discrete Event Simulation) paradigm is based on fully sharing the possibility to process any individual event across concurrent threads, rather than binding Logical Processes (LPs) and their events to threads. It allows concentrating, at any time, the computing power—the CPU-cores on board of a shared-memory machine—towards the unprocessed events that stand closest to the current commit horizon of the simulation run. This fruitfully biases the delivery of the computing power towards the hot portion of the model execution trajectory. In this article we present an innovative share-everything PDES system that provides (1) fully non-blocking coordination of the threads when accessing shared data structures and (2) fully speculative processing capabilities—Time Warp style processing—of the events. As we show via an experimental study, our proposal can cope with hard workloads where both classical Time Warp systems—based on LPs to threads binding—and previous share-everything proposals—not able to exploit fully speculative processing of the events—tend to fail in delivering adequate performance.},
}
@inproceedings{Con18b,
  author    = {Conoci, Stefano and Cingolani, Davide and Di~Sanzo, Pierangelo and Pellegrini, Alessandro and Ciciani, Bruno and Quaglia, Francesco},
  title     = {A Power Cap Oriented Time Warp Architecture},
  booktitle = {Proceedings of the 2018 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
  series    = {PADS},
  pages     = {97--100},
  publisher = {ACM},
  month     = may,
  year      = {2018},
  doi       = {10.1145/3200921.3200930},
  location  = {Rome, Italy},
  badges    = {available,reusable,reproduced},
  abstract  = {Controlling power usage has become a core objective in modern computing platforms. In this article we present an innovative Time Warp architecture oriented to efficiently run parallel simulations under a power cap. Our architectural organization considers power usage as a foundational design principle, as opposed to classical power-unaware Time Warp design. We provide early experimental results showing the potential of our proposal.},
}
@inproceedings{Pri18,
  author    = {Principe, Matteo and Tocci, Tommaso and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {Porting Event \& Cross-State Synchronization to the Cloud},
  booktitle = {Proceedings of the 2018 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
  series    = {PADS},
  pages     = {177--188},
  publisher = {ACM},
  month     = may,
  year      = {2018},
  note      = {Shortlisted for the Best Paper Award},
  doi       = {10.1145/3200921.3200929},
  location  = {Rome, Italy},
  abstract  = {Along the years, Parallel Discrete Event Simulation (PDES) has been enriched with programming facilities to bypass state disjointness across the concurrent Logical Processes (LPs). New supports have been proposed, offering the programmer approaches alternative to message passing to code complex LPs’ relations. Along this path we find Event & Cross-State (ECS), which allows writing event handlers which can perform in-place accesses to the state of any LP, by simply relying on pointers. This programming model has been shipped with a runtime support enabling concurrent speculative execution of LPs limited to shared-memory machines. In this paper, we present the design of a middleware layer that allows ECS to be ported to distributed-memory clusters of machines. A core application of our middleware is to let ECS-coded models be hosted on top of (low-cost) resources from the Cloud. Overall, ECS-coded models no longer demand for powerful shared-memory machines to execute in reasonable time. Thanks to our solution, we retain indeed the possibility to rely on the enriched ECS programming model while still enabling deployments of PDES models on convenient (Cloudbased) infrastructures. An experimental assessment of our proposal is also provided.},
}
@inproceedings{Eco17,
  author    = {Economo, Simone and Silvestri, Emiliano and Di Sanzo, Pierangelo and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {Prompt Application-Transparent Transaction Revalidation in Software Transactional Memory},
  booktitle = {Proceedings of the 16th IEEE International Symposium on Network Computing and Applications},
  series    = {NCA},
  pages     = {114--119},
  publisher = {IEEE Computer Society},
  month     = oct,
  year      = {2017},
  doi       = {10.1109/NCA.2017.8171349},
  location  = {Cambridge, MA, USA},
  abstract  = {Software Transactional Memory (STM) allows encapsulating shared-data accesses within transactions, executed with atomicity and isolation guarantees. The assessment of the consistency of a running transaction is performed by the STM layer at specific points of its execution, such as when a read or write access to a shared object occurs, or upon a commit attempt. However, performance and energy efficiency issues may arise when no shared-data read/write operation occurs for a while along a thread running a transaction. In this scenario, the STM layer may not regain control for a considerable amount of time, thus not being able to early detect if such transaction has become inconsistent in the meantime. To tackle this problem we present an STM architecture that, thanks to a lightweight operating system support, is able to perform a fine-grain periodic (hence prompt) revalidation of running transactions. Our proposal targets Linux and x86 systems and has been integrated with the open source TinySTM package. Experimental results with a port of the TPC-C benchmark to STM environments show the effectiveness of our solution.},
}
@InProceedings{Avr17,
author = {Avresky, Dimiter R. and Pellegrini, Alessandro and Di Sanzo, Pierangelo},
booktitle = {Proceedings of the 16th IEEE International Symposium on Network Computing and Applications},
title = {Machine Learning-based Management of Cloud Applications in Hybrid Clouds: a Hadoop Case Study},
year = {2017},
month = oct,
pages = {114--119},
publisher = {IEEE Computer Society},
series = {NCA},
abstract = {This paper illustrates the effort to integrate a machine learning-based framework which can predict the remaining time to failure of computing nodes with Hadoop applications. This work is part of a larger effort targeting the development of a cloud-oriented autonomic framework to increase the availability of applications subject to software anomalies, and to jointly improve their performance. The framework uses machine-learning, software rejuvenation, and load distribution techniques to proactively prevent failures. We believe that this work allows to set a possible path towards the definition of best practices for the development of systems to support autonomic management of cloud applications, illustrating what are the issues that should be addressed by the research community. Indeed, given the scale and the complexity of modern computing infrastructures, effective autonomic management approaches of cloud applications are becoming mandatory.},
doi = {10.1109/NCA.2017.8171352},
location = {Cambridge, MA, USA},
internal-note = {NOTE(review): pages 114--119 are byte-identical to sibling entry Eco17 in the same NCA 2017 proceedings despite a different DOI -- looks like a copy-paste slip; verify the true page range on IEEE Xplore before citing},
}
@InProceedings{Toc17,
author = {Tocci, Tommaso and Pellegrini, Alessandro and Quaglia, Francesco and Casanovas-García, Josep and Suzumura, Toyotaro},
booktitle = {Proceedings of the 21st IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
title = {ORCHESTRA: An Asynchronous Wait-Free Distributed GVT Algorithm},
year = {2017},
month = oct,
pages = {51--58},
publisher = {IEEE Computer Society},
series = {DS-RT},
abstract = {Taking advantage of computing capabilities offered by modern parallel and distributed architectures is fundamental to run large-scale simulation models based on the Parallel Discrete Event Simulation (PDES) paradigm. By relying on this computing organization, it is possible to effectively overcome both the power and the memory wall, which are core limiting aspects to deliver high-performance simulations. This is even more the case when relying on the speculative Time Warp synchronization protocol, which could be particularly memory greedy. At the same time, some form of coordination, such as the computation of the Global Virtual Time (GVT), is required by Time Warp Systems. These coordination points could easily become the bottleneck of large-scale simulations, hindering an efficient exploitation of the computing power offered by large supercomputing facilities. In this dissertation is presented ORCHESTRA, a coordination algorithm which is both wait-free and asynchronous. The nature of this algorithm allows any computing node to carry on simulation activities while the global agreement is reached, thus offering an effective building block to achieve scalable PDES. The general organization of ORCHESTRA could be adopted by different high-performance computing applications, thus paving the way to a more effective usage of modern computing infrastructures.},
doi = {10.1109/DISTRA.2017.8167666},
location = {Rome, Italy},
}
@InProceedings{Ian17c,
author = {Ianni, Mauro and Marotta, Romolo and Pellegrini, Alessandro and Quaglia, Francesco},
booktitle = {Proceedings of the 21st IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
title = {Towards a Fully Non-blocking Share-everything PDES Platform},
year = {2017},
month = oct,
pages = {25--32},
publisher = {IEEE Computer Society},
series = {DS-RT},
abstract = {Shared-memory multi-core platforms are changing the nature of Parallel Discrete Event Simulation (PDES) because of the possibility to fully share the workload of events to be processed across threads. In this context, one rising PDES paradigm - referred to as share-everything PDES - is no longer based on the concept of (temporary) binding of simulation objects to worker threads. Rather, each worker thread can - at any time - pick from a fully shared event pool an event to process which can be destined to whatever simulation object. While attention has been posed on the design of concurrent shared pools, allowing non-blocking parallel operations, the scenario where two (or more) threads pick events destined to the same simulation object still lacks adequate synchronization support. In fact, these events are currently sequentialized and processed in a critical section touching the simulation object state, thus leading threads to mutually block each other. In this article we present the design of a share-everything speculative PDES engine that prevents mutual thread blocks because of the access to a same object state. In our design, the non-blocking property is seen as a vertical attribute of the engine (not only of the event pool). This vertical view demands for innovative event-dispatching schemes and, at the same time, innovative interactions with (and management of) the fully-shared event pool, which are features that we embed in our innovative design.},
doi = {10.1109/DISTRA.2017.8167663},
location = {Rome, Italy},
name = {ds-rt17c},
}
@inproceedings{Ian17b,
  author    = {Ianni, Mauro and Marotta, Romolo and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {A Non-blocking Global Virtual Time Algorithm with Logarithmic Number of Memory Operations},
  booktitle = {Proceedings of the 21st IEEE/ACM International Symposium on Distributed Simulation and Real Time Applications},
  series    = {DS-RT},
  pages     = {17--24},
  publisher = {IEEE Computer Society},
  month     = oct,
  year      = {2017},
  note      = {Shortlisted for the Best Paper Award},
  doi       = {10.1109/DISTRA.2017.8167662},
  location  = {Rome, Italy},
  name      = {ds-rt17b},
  abstract  = {The increasing diffusion of shared-memory multi-core machines has given rise to a change in the design of Parallel Discrete Event Simulation (PDES) platforms. In particular, the possibility to share large amounts of memory by many worker threads has lead to a boost in the adoption of non-blocking coordination algorithms, which have been proven to offer higher scalability when compared to their blocking counterparts based on critical sections. In this article we present an innovative non-blocking algorithm for computing Global Virtual Time (GVT)---namely, the current commit horizon---in multi-thread PDES engines to be run on top of multi-core machines. Beyond being non-blocking, our proposal has the advantage of providing a logarithmic (rather than linear) number of per-thread memory operations---read/write operations of values involved in the reduction for computing the GVT value---vs the amount of threads participating in the GVT computation. This allows for keeping low the actual CPU time that is required for determining the new GVT value. We compare our algorithm with a literature solution, still based on the non-blocking approach, but entailing a linear number of memory operations, quantifying the advantages from our proposal especially for very large numbers of threads participating in the GVT computation.},
}
@InProceedings{Ian17,
author = {Ianni, Mauro and Pellegrini, Alessandro and Quaglia, Francesco},
booktitle = {Proceedings of the 2017 IEEE Cluster Conference},
title = {A Wait-free Multi-word Atomic (1,N) Register for Large-scale Data Sharing on Multi-core Machines},
year = {2017},
month = sep,
pages = {188--192},
publisher = {IEEE Computer Society},
series = {CLUSTER},
abstract = {We present a multi-word atomic (1,N) register for multi-core machines exploiting Read-Modify-Write (RMW) instructions to coordinate the writer and the readers in a wait-free manner. Our proposal, called Anonymous Readers Counting (ARC), enables large-scale data sharing by admitting up to $2^{32} - 2$ concurrent readers on off-the-shelf 64-bit machines, as opposed to the most advanced RMW-based approach which is limited to 58 readers. Further, ARC avoids multiple copies of the register content while accessing it—this affects classical register’s algorithms based on atomic read/write operations on single words. Thus, ARC allows for higher scalability with respect to the register size.},
doi = {10.1109/CLUSTER.2017.84},
location = {Honolulu, HI, USA},
}
@InProceedings{Cin17,
author = {Cingolani, Davide and Pellegrini, Alessandro and Schordan, Markus and Quaglia, Francesco and Jefferson, David R.},
booktitle = {Proceedings of the 2017 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
title = {Dealing with Reversibility of Shared Libraries in PDES},
year = {2017},
month = may,
publisher = {ACM},
series = {PADS},
abstract = {State recoverability is a crucial aspect of speculative Time Warp-based Parallel Discrete Event Simulation. In the literature, we can identify three major classes of techniques to support the correct restoration of a previous simulation state upon the execution of a rollback operation: state checkpointing/restore, manual reverse computation and automatic reverse computation. The latter class has been recently supported by relying either on binary code instrumentation or on source-to-source code transformation. Nevertheless, both solutions are not intrinsically meant to support a reversible execution of third-party shared libraries, which can be pretty useful when implementing complex simulation models.
In this paper, we present an architectural solution (realized as a static C library) which allows to transparently instrument at runtime any third party shared library, with no need for any modification to the model's code. We also present a preliminary experimental evaluation, based on the integration of our library with the ROOT-Sim simulation engine.},
location = {Singapore},
name = {pads17b},
internal-note = {NOTE(review): pages and doi fields are missing -- every sibling PADS 2017 entry (e.g. Mar17, doi 10.1145/3064911.*) carries both; look the values up on the ACM Digital Library rather than guessing},
}
@inproceedings{Mar17,
  author    = {Marotta, Romolo and Ianni, Mauro and Pellegrini, Alessandro and Quaglia, Francesco},
  title     = {A Conflict-Resilient Lock-Free Calendar Queue for Scalable Share-Everything PDES Platforms},
  booktitle = {Proceedings of the 2017 ACM SIGSIM Conference on Principles of Advanced Discrete Simulation},
  series    = {PADS},
  pages     = {41--52},
  publisher = {ACM},
  month     = may,
  year      = {2017},
  doi       = {10.1145/3064911.3064927},
  location  = {Singapore},
  abstract  = {Emerging share-everything Parallel Discrete Event Simulation (PDES) platforms rely on worker threads fully sharing the workload of events to be processed. These platforms require efficient event pool data structures enabling high concurrency of extraction/insertion operations. Non-blocking event pool algorithms are raising as promising solutions for this problem. However, the classical non-blocking paradigm leads concurrent conflicting operations, acting on a same portion of the event pool data structure, to abort and then retry. In this article we present a conflict-resilient non-blocking calendar queue that enables conflicting dequeue operations, concurrently attempting to extract the minimum element, to survive, thus improving the level of scalability of accesses to the hot portion of the data structure---namely the bucket to which the current locality of the events to be processed is bound. We have integrated our solution within an open source share-everything PDES platform and report the results of an experimental analysis of the proposed concurrent data structure compared to some literature solutions.},
}
@InProceedings{Sil17,
author = {Silvestri, Emiliano and Economo, Simone and Di Sanzo, Pierangelo and Pellegrini, Alessandro and Quaglia, Francesco},