-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathequalPartition.pl
More file actions
executable file
·105 lines (80 loc) · 2.15 KB
/
equalPartition.pl
File metadata and controls
executable file
·105 lines (80 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/perl -w
# printUsage()
#   Writes the command-line help text to STDOUT, one line per print call,
#   and then terminates the program with exit status 1. Never returns.
sub printUsage {
    my @helpLines = (
        "Randomly allocate rows into output files row by row. Each output file has an equal (or fewer by 1) number of rows.",
        "Usage: ~ [-h y|n] [-a] <in.csv> <num_of_files> <out_stem>",
        " -h whether the input file has header or not",
        " -a\tto append results to existing results (if any)",
    );
    for my $helpLine (@helpLines) {
        print "$helpLine\n";
    }
    exit(1);
}
use Getopt::Std;
# Parse the command-line switches.
#   -h y|n   whether the input file has a header row; when the switch is
#            absent $header stays "U" (unspecified)
#   -a       append to existing output files instead of truncating them
# After the switches exactly three positional arguments must remain:
# <in.csv> <num_of_files> <out_stem>.
my(%options);
# getopts() returns false when it meets an unknown switch or a switch that
# is missing its value; show the usage text instead of carrying on with a
# half-parsed command line.
getopts("h:a", \%options) or printUsage();
my $APPEND = exists $options{"a"};
my $header = exists $options{"h"} ? $options{"h"} : "U"; # "U" = unspecified
if(scalar(@ARGV) != 3) {
    printUsage();
}
use Util;
use Flat;
use math;
use Fcntl ':flock';
# Open the input table named by the first positional argument.
#   -h y / -h n : Flat->new(file, 1) / Flat->new(file, 0)
#   no -h ("U") : Flat->new1(file)
# Any other -h value is rejected with a message followed by the usage text
# (printUsage exits the program).
my $in;
my $inFile = shift @ARGV;
if($header eq "y" or $header eq "n") {
    $in = Flat->new($inFile, $header eq "y" ? 1 : 0);
}
elsif($header eq "U") {
    $in = Flat->new1($inFile);
}
else {
    print "-h should be followed by either 'y' or 'n'\n";
    printUsage();
}
# Deal the input rows out to the files <out_stem>.<k>.csv, k = 1..$num.
#
# NOTE(review): the usage text calls the second argument <num_of_files>,
# yet it is passed through $in->getFieldIndex(); confirm that this returns
# a numeric argument unchanged. Kept as-is (including list context).
my($num) = $in->getFieldIndex(shift @ARGV);
my $outStem = shift @ARGV;
my @fieldNames = $in->getFieldNames();
my %fname2fh;                       # output file index => open write handle
my $nrows = $in->getNumOfRows();
$in->reset();
# One target file index per input row. Before randomizing, the indices
# cycle 1..$num, so every file is assigned either floor($nrows/$num) or
# ceil($nrows/$num) rows. math::util::randomize presumably shuffles that
# list - TODO confirm.
my @fileIndice = math::util::randomize(map { $_ % $num + 1 } (0..($nrows - 1)));
#die "fileIndice = @fileIndice\n";
while(my $row = $in->readNextRow()) {
    my $outFileIndex = shift @fileIndice;
    # print "$outFileIndex\n";
    # next;
    my $fh = $fname2fh{$outFileIndex};
    if(!defined $fh) { # first row for this file: open it now
        my $outFile = "$outStem.$outFileIndex.csv";
        # When appending to a file that already has content, a header row
        # written here would land in the middle of the data, so remember
        # whether the file was non-empty before it is opened.
        my $hasContent = ($APPEND && -s $outFile);
        my $mode = $APPEND ? '>>' : '+>';
        # Three-argument open with a lexical handle: the path can no longer
        # be misread as part of the mode, and no symbolic handle names
        # ("OUT1", "OUT2", ...) are created in the symbol table.
        open $fh, $mode, $outFile or die "Cannot open $outFile: $!";
        #disable lock for now: flock($fh, LOCK_EX);
        if($in->hasHeader() && !$hasContent) {
            # Pass a copy of the names, as the original code did.
            my @fldNames = @fieldNames;
            print {$fh} Flat::dataRowToString(@fldNames), "\n";
        }
        $fname2fh{$outFileIndex} = $fh;
    }
    # NOTE(review): data rows are tab-joined while the header goes through
    # Flat::dataRowToString - confirm both use the same delimiter.
    print {$fh} join("\t", @{$row}), "\n";
}
# unlock the output files
# foreach $fh (values %val2outfile) {
# disable lock for now: flock($fh, LOCK_UN);
# }
# Util::run("gzip $fname", 1);
# Close every output file. Buffered write errors (for example a full disk)
# only surface at close time, so a failed close is fatal and names the
# file it belongs to.
foreach my $outFileIndex (keys %fname2fh) {
    close $fname2fh{$outFileIndex}
        or die "Cannot close $outStem.$outFileIndex.csv: $!";
}
# Report completion with the current date; the message now names this
# script (equalPartition.pl) rather than partition.pl.
print "DONE equalPartition.pl at ", `date`;