package File::Strfile;
our $VERSION = '0.02';
use 5.016;
use strict;
use warnings;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(%STRFLAGS);
use Carp;
use File::Spec;
use List::Util qw(shuffle sum);
our %STRFLAGS = (
RANDOM => 0x1,
ORDERED => 0x2,
ROTATED => 0x4,
);
my @VERSIONS = (1, 2);
my $STRFILE_HDR_LEN = 24;
# Strfile header format:
# uint32 version;
# uint32 strnum;
# uint32 longest str len;
# uint32 shortest str len;
# uint32 flags; (see %STRFLAGS)
# uint8[4] long-aligned space
# [0] is delimit char
# Header is Big-Endian.
sub new {
my $class = shift;
my $src = shift;
my $param = shift;
my $self = {
SrcFile => File::Spec->rel2abs($src),
_srcfh => undef,
Version => 1,
StrNum => 0,
LongLen => 0,
ShortLen => 0xffffffff,
Flags => 0,
Delimit => '%',
Offsets => [],
};
bless $self, $class;
open $self->{_srcfh}, '<', $self->{SrcFile}
or croak "Failed to open $self->{SrcFile} for reading: $!";
if ($param->{DataFile}) {
$self->read_strfile($param->{DataFile});
} else {
$self->_create_strfile_data();
if ($param->{Delimit}) {
$self->{Delimit} = unpack "a", $param->{Delimit};
}
}
if (defined $param->{Version}) {
croak "$param->{Version} is an invalid strfile version"
unless _version_check($param->{Version});
$self->{Version} = $param->{Version};
}
# Order flag gets priority over Random
if ($param->{FcOrder}) {
$self->order(1);
} elsif ($param->{Order}) {
$self->order();
} elsif ($param->{Random}) {
$self->random();
}
$self->{Flags} |= $STRFLAGS{ROTATED} if $param->{Rotate};
return $self;
}
sub _version_check {
my $ver = shift;
return (grep { $ver == $_ } @VERSIONS) ? 1 : 0;
}
sub _create_strfile_data {
my $self = shift;
seek $self->{_srcfh}, 0, 0;
# Each offset table must start with 0x00
push @{$self->{Offsets}}, tell $self->{_srcfh};
my $coff = 0;
my $loff = 0;
my $curlen = 0;
my $l = '';
while (defined $l) {
$l = readline $self->{_srcfh};
if (not defined $l or $l eq "$self->{Delimit}\n") {
$coff = tell $self->{_srcfh};
$curlen = $coff - $loff - (length $l // 0);
$loff = $coff;
next unless $curlen;
push @{$self->{Offsets}}, $coff;
$self->{StrNum}++;
if ($curlen < $self->{ShortLen}) {
$self->{ShortLen} = $curlen;
}
if ($curlen > $self->{LongLen}) {
$self->{LongLen} = $curlen;
}
}
}
$self->{Version} = 1;
}
sub read_strfile {
my $self = shift;
my $file = shift;
open my $fh, '<', $file or croak "Failed to open $file for reading: $!";
binmode $fh;
read $fh, my ($buf), $STRFILE_HDR_LEN;
(
$self->{Version},
$self->{StrNum},
$self->{LongLen},
$self->{ShortLen},
$self->{Flags},
my $delim,
# We're ignoring 3 padding bytes
) = unpack "N N N N N a", $buf;
unless (_version_check($self->{Version})) {
croak "$file bogus strfile";
}
if ($self->{LongLen} < $self->{ShortLen}) {
croak "$file bogus strfile";
}
if ($self->{Flags} > sum values %STRFLAGS) {
croak "$file bogus strfile";
}
$self->{DataFile} = $file;
$self->{Offsets} = [];
foreach my $i (0 .. $self->{StrNum}) {
my $off;
# v1 strfiles use 64-bit offsets
if ($self->{Version} == 1) {
read $fh, $off, 8;
(my ($u), $self->{Offsets}->[$i]) = unpack "N N", $off;
croak "Offset $i exceeds 4GB" if $u;
# v2 strfiles use 32-bit offsets
} elsif ($self->{Version} == 2) {
read $fh, $off, 4;
$self->{Offsets}->[$i] = unpack "N", $off;
}
}
close $fh;
}
sub order {
my $self = shift;
my $fc = shift;
# Ignore leading non-alphanumeric characters.
my @strings = map { s/^[\W_]+//r } $self->strings();
@strings = map { fc } @strings if $fc;
my @offsets =
map { $self->{Offsets}->[$_] }
sort { $strings[$a] cmp $strings[$b] } (0 .. $self->{StrNum} - 1);
push @offsets, $self->{Offsets}->[$self->{StrNum}];
$self->{Offsets} = \@offsets;
$self->{Flags} |= $STRFLAGS{ORDERED};
}
sub random {
my $self = shift;
my @offsets = map {
$self->{Offsets}->[$_]
} shuffle(0 .. $self->{StrNum} - 1);
push @offsets, $self->{Offsets}->[$self->{StrNum}];
$self->{Offsets} = \@offsets;
$self->{Flags} |= $STRFLAGS{RANDOM};
# Unset Ordered flag, as it takes priority over Random
if ($self->{Flags} & $STRFLAGS{ORDERED}) {
$self->{Flags} -= $STRFLAGS{ORDERED};
}
}
sub string {
my $self = shift;
my $n = shift;
return undef if $n >= $self->{StrNum};
seek $self->{_srcfh}, $self->{Offsets}->[$n], 0;
my $string = '';
my $l = '';
while (defined $l) {
$l = readline $self->{_srcfh};
last if not defined $l or $l eq "$self->{Delimit}\n";
$string .= $l;
}
# ROT13
$string =~ tr/A-Za-z/N-ZA-Mn-za-m/ if $self->{Flags} & $STRFLAGS{ROTATED};
return $string;
}
sub strings {
my $self = shift;
return map { $self->string($_) } (0 .. $self->{StrNum} - 1);
}
sub strings_like {
my $self = shift;
my $re = shift;
return grep { /$re/m } $self->strings();
}
sub get {
my $self = shift;
my $get = shift;
return undef if $get =~ /^_/ or not defined $self->{$get};
return $self->{$get};
}
sub write_strfile {
my $self = shift;
my $file = shift // "$self->{SrcFile}.dat";
open my $fh, '>', $file or croak "Failed to open $file for writing: $!";
my $hdr = pack "N N N N N c c c c", (
$self->{Version},
$self->{StrNum},
$self->{LongLen},
$self->{ShortLen},
$self->{Flags},
ord $self->{Delimit},
0, 0, 0,
);
print { $fh } $hdr;
foreach my $i (0 .. $self->{StrNum}) {
my $off;
if ($self->{Version} == 1) {
$off = pack "N N", (0, $self->{Offsets}->[$i]);
} elsif ($self->{Version} == 2) {
$off = pack "N", $self->{Offsets}->[$i];
}
print { $fh } $off;
}
}
DESTROY {
my $self = shift;
close $self->{_srcfh};
}
1;
=head1 NAME
File::Strfile - OO strfile interface
=head1 SYNOPSIS
use File::Strfile;
$strfile = File::Strfile->new($src);
$strfile->read_strfile($datafile);
$strfile->random();
$strfile->order();
$str0 = $strfile->string(0);
foreach my $str ($strfile->strings()) {
...
}
$strfile->write_strfile($datafile);
=head1 DESCRIPTION
File::Strfile provides an object oriented interface for reading and writing
strfiles, a file format often associated with the classic UNIX program
L<fortune(6)>. Strfiles are used to provide random access to strings stored in
another file, called the strfile source. The source files
consists of a collection of strings seperated by delimiting lines, which are
lines containing only a single delimiting character, typically a percentage (%)
sign. The strfile data
files are usually stored in the same directory as the source files, with the
same name but with the ".dat" suffix added. They contain a header that describes
the strfile database and a table of offsets pointing to each string in the
source file.
This module only provides an interface for manipulating strfile data files, not
the source text files themselves.
=head1 Object Methods
=head2 File::Strfile->new($srcpath, [{opt => 'val'}])
Returns a new File::Strfile object. $srcpath is the path to the source strfile.
new() can be given a hash reference of options. Note that all options are
case-sensitive.
=over 4
=item DataFile
Path to the strfile-generated data file. Instead of new() creating strfile data
from scratch, it will read data the from the given data file by calling
read_strfile(). Some fields can be overrided by passing additional options.
=item Version
Set version for outputted strfile. The following are acceptable version numbers:
=over 4
=item 1
Original strfile version. Stores string offsets as unsigned 64-bit integars.
Most common. Default.
=item 2
Newer strfile version. Stores string offsets as unsigned 32-bit integars.
=back
=item Random
Randomize the order of string offsets.
=item Order
Order string offsets alphabetically.
=item FcOrder
Order string offsets alphabetically, case-insensitive.
=item Rotate
Mark the source file as being ROT-13 ciphered.
=item Delimit
Set delimitting character. Default is a percentage sign (%). This option does
not work with the DataFile option.
=back
new() dies upon failure.
=head2 $strfile->read_strfile($file)
Read strfile data from $file.
=head2 $strfile->order([$fc])
Order strings alphabetically. If $fc is true, sort is done insensitive to case.
=head2 $strfile->random()
Randomize the order of strings.
=head2 $strfile->string($n)
Get $n-th string from string file. Returns undef if $n-th string does not exist.
=head2 $strfile->strings()
Returns list of all strings in strfile, in the order specified by the offset
table.
=head2 $strfile->strings_like($re)
Return list of strings that evaluate true given the qr regex $re.
For example, to get every string that starts with 'YOW!':
my @yows = $strfile->strings_like(qr/^YOW!/)
Note that the 'm' (multiline) option is automatically used and does not need
to be specified.
=head2 $strfile->get($member)
Return value of $member in $strfile object. Note $member is case-sensitive.
The following are valid members:
=over 4
=item SrcFile
Absolute path to strfile source file.
=item Version
Version of $strfile.
=item StrNum
Number of strings in $strfile.
=item LongLen
Length (in bytes) of the longest string in $strfile.
=item ShortLen
Length (in bytes) of the shortest string in $strfile.
=item Flags
Flag bitfield for $strfile. See documentation for %STRFLAGS for what each
bitmask means.
=item Delimit
Delimitting character.
=item Offsets
Array ref of strfile offsets. The last offset will not be a string offset but
the EOF offset.
=back
On failure, get() returns undef.
=head2 $strfile->write_strfile([$file])
Write $strfile data file to either $file. If $file is not supplied, write to
source file path + '.dat' suffix.
=head1 Global Variables
=over 4
=item $File::Strfile::VERSION
File::Strfile version.
=item %File::Strfile::STRFLAGS
Hash of strfile flags and their bitmasks.
=over 4
=item RANDOM => 0x1
Strings were randomly sorted.
=item ORDERED => 0x2
Strings were sorted alphabetically. Takes priority over Random.
=item ROTATED => 0x4
Strings are ROT-13 ciphered.
=back
Able to be exported.
use File::Strfile qw(%STRFLAGS);
=back
=head1 EXAMPLES
Here is an example of a typical source strfile:
A can of ASPARAGUS, 73 pigeons, some LIVE ammo, and a FROZEN DAIQUIRI!!
%
A dwarf is passing out somewhere in Detroit!
%
A wide-eyed, innocent UNICORN, poised delicately in a MEADOW filled
with LILACS, LOLLIPOPS & small CHILDREN at the HUSH of twilight??
%
Actually, what I'd like is a little toy spaceship!!
=head1 RESTRICTIONS
Despite version 1 strfiles storing string offsets as unsigned 64-bit integars,
they are still read as 32-bit. This means that File::Strfile will not be
able to read strfile sources
larger than 4GB (about the size of 1,000 plaintext KJV Bibles).
File::Strfile tries to emulate the original BSD strfile's behavior as close as
possible, which means it will also inherit some of its quirks.
=head1 AUTHOR
Written by Samuel Young E<lt>L<samyoung12788@gmail.com>E<gt>.
=head1 COPYRIGHT
Copyright 2024, Samuel Young
This library is free software; you may redistribute it and/or
modify it under the same terms as Perl itself.
=head1 SEE ALSO
L<fortune(6)>, L<strfile(8)>
=cut