Skip to content

Commit fec223b

Browse files
committed
Changed the behavior of auto-cleaning
- We no longer auto-clean all the time, only when we detect corruption - Additionally, we added the clean flag, which will clean the file and then quit. This is intended for when the process takes too long on clusters with a shared filesystem.
1 parent 2c2b109 commit fec223b

3 files changed

Lines changed: 38 additions & 8 deletions

File tree

src/checkpoint.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,9 @@ void checkpoint_t::clean() {
149149
return;
150150
}
151151
std::string backup_filename = _checkpoint_filename + ".bak";
152-
if (__MPI_RANK__ == 0 && needs_cleaning()) {
152+
if (__MPI_RANK__ == 0) {
153153
auto lock = write_lock<fcntl_lock_behavior::block>();
154+
lseek(_file_descriptor, 0, SEEK_SET);
154155
auto copy_fd = open(backup_filename.c_str(),
155156
O_RDWR | O_CREAT | O_APPEND | O_EXCL, 0640);
156157
if (copy_fd == -1) {
@@ -304,7 +305,7 @@ checkpoint_t::read_results() {
304305
return results;
305306
}
306307

307-
bool checkpoint_t::needs_cleaning(){
308+
bool checkpoint_t::needs_cleaning() {
308309
auto lock = write_lock<fcntl_lock_behavior::block>();
309310
auto current_fd = fcntl(_file_descriptor, F_DUPFD, 0);
310311

@@ -336,6 +337,7 @@ bool checkpoint_t::needs_cleaning(){
336337
debug_string(
337338
EMIT_LEVEL_WARNING,
338339
"Checkpoint file is corrupted, we will resume with what we can");
340+
close(current_fd);
339341
return true;
340342
}
341343
}

src/main.cpp

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,13 @@ static void print_usage() {
122122
<< " --verbose\n"
123123
<< " Increase the verbosity level. Can be repeated to\n"
124124
<< " level further.\n"
125+
<< " --clean\n"
126+
<< " Clean the checkpoint file and exit. Normally, this should\n"
127+
<< " not be needed, but occasionally cleaining on a multi-node\n"
128+
<< " system can take a lot of time. In that case, use this flag\n"
129+
<< " on a single node, which will make RootDigger clean the\n"
130+
<< " checkpoint file so that the job can be run quickly on\n"
131+
<< " multi-node systems.\n"
125132
<< std::endl;
126133
}
127134

@@ -151,8 +158,9 @@ cli_options_t parse_options(int argv, char **argc) {
151158
{"version", no_argument, 0, 0}, /* 21 */
152159
{"debug", no_argument, 0, 0}, /* 22 */
153160
{"mpi-debug", no_argument, 0, 0}, /* 23 */
154-
{"echo", no_argument, 0, 0}, /* 24 */
155-
{"help", no_argument, 0, 0}, /* 25 */
161+
{"clean", no_argument, 0, 0}, /* 24 */
162+
{"echo", no_argument, 0, 0}, /* 25 */
163+
{"help", no_argument, 0, 0}, /* 26 */
156164
{0, 0, 0, 0},
157165
};
158166

@@ -242,10 +250,13 @@ cli_options_t parse_options(int argv, char **argc) {
242250
case 23: // mpi-debug
243251
__VERBOSE__ = EMIT_LEVEL_MPI_DEBUG;
244252
break;
245-
case 24: // echo
253+
case 24: // clean
254+
cli_options.clean = true;
255+
break;
256+
case 25: // echo
246257
cli_options.echo = true;
247258
break;
248-
case 25: // help
259+
case 26: // help
249260
print_usage();
250261
std::exit(0);
251262
break;
@@ -289,6 +300,7 @@ void merge_options_checkpoint(cli_options_t &cli_options,
289300
checkpoint.load_options(checkpoint_options);
290301
checkpoint_options.threads = cli_options.threads;
291302
checkpoint_options.silent = cli_options.silent;
303+
checkpoint_options.clean = cli_options.clean;
292304

293305
std::swap(cli_options, checkpoint_options);
294306
}
@@ -336,7 +348,6 @@ int wrapped_main(int argv, char **argc) {
336348
MPI_Barrier(MPI_COMM_WORLD);
337349
#endif
338350

339-
340351
/* Use the tree path for the prefix */
341352
if (cli_options.prefix.empty()) {
342353
cli_options.prefix = cli_options.tree_filename;
@@ -345,8 +356,24 @@ int wrapped_main(int argv, char **argc) {
345356
checkpoint_t checkpoint(cli_options.prefix);
346357
merge_options_checkpoint(cli_options, checkpoint);
347358
if (__MPI_RANK__ == 0) {
359+
if (cli_options.clean) {
360+
debug_print(EMIT_LEVEL_IMPORTANT, "Cleaning the checkpoint file %s",
361+
checkpoint.get_filename().c_str());
362+
checkpoint.clean();
363+
#ifdef MPI_VERSION
364+
MPI_Barrier(MPI_COMM_WORLD);
365+
#endif
366+
return 0;
367+
}
348368
checkpoint.save_options(cli_options);
349-
checkpoint.clean();
369+
if (checkpoint.needs_cleaning()) {
370+
checkpoint.clean();
371+
}
372+
} else if (cli_options.clean) {
373+
#ifdef MPI_VERSION
374+
MPI_Barrier(MPI_COMM_WORLD);
375+
#endif
376+
return 0;
350377
}
351378
#ifdef MPI_VERSION
352379
MPI_Barrier(MPI_COMM_WORLD);

src/util.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ struct cli_options_t {
156156
bool exhaustive = false;
157157
bool echo = false;
158158
bool invariant_sites = false;
159+
bool clean= false;
159160
initialized_flag_t early_stop;
160161

161162
bool operator==(const cli_options_t &other) const {

0 commit comments

Comments
 (0)