So I was tasked with taking an input file of the form
Code: Select all
blah - blah blah
114 - 107-113
115 - 58-65 66-80
116 - 66-80
117 - 81-93
118 - none
119 - none
120 - 51-57
121 - 58-65 81-84
122 - none
123 - none
124 - none
[...]
Code: Select all
%
114 107 108 109 110 111 112 113
%
115 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
...
%
123
%
124
...
convert.pl
Code: Select all
##########################################################################
# Created by:
# Steven A Wilson (Alex)
# Completed on: 04/18/2009
# Filename: convert.pl
#
# Output a number range as character literals
# Then format them correctly for a result set
#
# See 'perl convert.pl -h' for usage.
#
# Pipe 'cat file' as input
# It will send to parsed.txt or specified file for output
#
# Piped file must be of the form
# 999 - ((123-456)|(789))*
# If the section after "999 - " is anything but some sequence of numbers,
# no minor components are output for that line (So you get %\n999\n%...)
#
# For Dr. Jane Hayes' Traceability Challenge '09
#
##########################################################################
# Handle arguments ---------------
my $filename = "";
my $mode = "";
my $verbose = -1;
foreach (@ARGV) {
$str = $_; # Because I wrote $str before I knew how foreach worked
if (index($str, '-') == 0) {
if (index($str, 'a') == 1 || index($str, 'o') == 1) {
if ($mode ne "") {
die("Multiple file mode specifiers.\n");
}
if (index($str, 'a') == 1) {
$mode = ">>";
} elsif (index($str, 'o') == 1) {
$mode = ">";
}
} elsif (index($str, 'v') == 1 || index($str, 'q') == 1) {
if ($verbose != -1) {
die("Multiple verbosities specified.\n");
}
if (index($str, 'q') == 1) {
$verbose = 0;
} elsif (index($str, 'v') == 1) {
$verbose = 1;
}
} else {
die("Usage: $0 [-o|-a] [filename] [-v|-q]\n" .
"-o: Overwrite [filename]\n" .
"-a: Append to [filename] (default)\n" .
"-v: verbose output\n" .
"-q: quiet output (default)\n" .
"filename: file to write to. Default 'parsed.txt'.\n" .
"\tfilename cannot start with '-' cause I'm a bit lazy.\n\n" .
"Input is read from STDIN.\n" .
"$0 is equivalent to $0 -a parsed.txt -q\n" .
"NOTE: Order of the parameters doesn't matter.\n");
}
} else {
if ($filename ne "") {
die("Multiple filenames specified.\n");
}
$filename = $str;
}
}
# Parameter defaults -------------------------
if ($filename eq "") {
$filename = "parsed.txt";
}
if ($mode eq "") {
$mode = ">>";
}
if ($verbose == -1) {
$verbose = 0;
}
$filename = $mode . $filename;
# Open the output file -----------------------
my $opened = open(FILE, $filename);
if (! $opened) {
die("Cannot open $filename.\n");
}
# Parse the input ----------------------------
while (<STDIN>) {
my $startnum = -1;
my $endnum = -1;
my $lonenum = -1;
my $old2 = "";
my($line) = $_;
chomp($line);
if ($verbose) {
print "* Line: '$line'\n";
}
# Matches lines of the form
# 999 - 123-456 789-123 456 789
# Where all patterns after "999 - " can occur multiple times,
# however, only the first pattern is parsed at the moment,
# am going to do multiple passes on the same file.
#
# Appropriate logical group names were found by using
# http://www.regexlib.com/RETester.aspx
#
# $7 (the remainder of the line) includes the leading space
$line =~ s/^(\d+)[ ]-[ ](((\d+)-(\d+))|(\d+))?(.*)/$1 -$7/;
if ($1 ne "") {
# Change the input to the format:
# %\n
# MAJOR\tMINOR\tMINOR\tMINOR...
if ($verbose) {
print "\t* Matching for major case $1:\n";
}
print FILE "%\n";
print FILE "$1";
while ($2 ne $old2) {
$old2 = $2; # $2 doesn't get cleared if it isn't matched
if ($3 ne "") {
$startnum = atoi($4);
$endnum = atoi($5);
if ($verbose) {
print "\t\tThere is a range of numbers: $3.\n";
}
for ($count = $startnum; $count <= $endnum; $count++) {
print FILE "\t$count";
}
}
if ($6 ne "") {
if ($verbose) {
print "\t\tThere is a single match: $6.\n";
}
print FILE "\t$6";
}
$line =~ s/^(\d+)[ ]-[ ](((\d+)-(\d+))|(\d+))?(.*)/$1 -$7/;
} # End loop
if ($verbose) {
print "\tNo more matches for $1\n";
print "\tLine finished.\n";
}
print FILE "\n";
} else { # Essentially this line does not start "# -"
print "Line ('$line') does not match.\n";
}
}
close(FILE);
print "Done.\n";
sub atoi {
my $t;
foreach my $d (split(//, shift())) {
$t = $t * 10 + $d;
}
return $t;
}
The regular expression:
Code: Select all
$line =~ s/^(\d+)[ ]-[ ](((\d+)-(\d+))|(\d+))?(.*)/$1 -$7/;
Code: Select all
Initial string: "115 - 58-65 66-80"
$1 = 115, $2 = 58-65, $3 = (I forget), $4 = 58, $5 = 65, $6 = "", $7 = " 66-80"
After regular expression: "115 - 66-80"
$1 = 115, $2 = 66-80, $3 = (I forget), $4 = 66, $5 = 80, $6 = "", $7 = ""
After regular expression: "115 -"
$1 = 115, $2 = 66-80, $3 = "", $4 = "", $5 = "", $6 = "", $7 = "" # Yes I was actually getting a value in $2, hence the $old2 variable.
( (one or more digits followed by a - followed by one or more digits) or
(one or more digits) ), of which it stops once it finds the first possible match. The rest of the string is caught by (.*) which goes into variable $7.
This could probably be reduced but I wanted a special case for having a single number and not a range.
The regular expression makes multiple passes at the same line, generating the correct output as it goes, until the line is deemed to be completely parsed.
atoi:
Code: Select all
sub atoi {
my $t;
foreach my $d (split(//, shift())) {
$t = $t * 10 + $d;
}
return $t;
}