#!/usr/bin/perl -w # Create b-file for A257371. # Kevin Ryde, December 2020. # # Usage: perl a257371-bfile.pl foma 10000 # or perl a257371-bfile.pl hfst 2000 # or perl a257371-bfile.pl sfst 2000 # or perl a257371-bfile.pl flat 100 # # Input file: b000040.txt # Output file: b257371.txt # When foma method, temporary files: b257371-tempfile.foma # b257371-tempfile.out # When sfst method, temporary files: b257371-tempfile.sfst # b257371-tempfile-999.out # # Prime numbers (in decimal) are read from file b000040.txt from A000040. # A000040 includes a bigger a-file too if want to go further. # Any equivalent source of primes would suit. # # There are four methods: # foma program - generate and run a script, parse its output. # hfst program - generate and run a script, parse its output. # sfst program - generate and run a script, examine its compiled machines. # FLAT Perl module - not fast enough for more than a small b-file. # # Command line option "foma", "hfst", "sfst", or "flat", chooses the method, # and then a number is the maximum n to go up to (inclusive). # # Foma is the fastest method. HFST is not quite as fast. # SFST is good but here naive use of its command-line tools holds it back. # FLAT is pure-Perl and included only for small n. # # HFST has a choice of underlying library, which it calls a "format". # This includes built-in copies of Foma, SFST, and OpenFst (libfst). # On the command line here, select a particular format like # # perl a257371-bfile.pl hfst-openfst-tropical # # which runs "hfst-xfst --format=openfst-tropical ..." # # Multiple methods are offered as a consistency check since this sort of # sequence is highly dependent on moderately complex computer state machine # manipulations. use 5.006; use strict; use warnings; use autodie ':all'; # including die on failed system() use List::Util 'max'; $|=1; my $want_num_primes = 1000; my $primes_filename = 'b000040.txt'; my $output_bfile_filename = 'b257371.txt'; my $foma_program = 'foma -f'; my $foma_script_filename = 'b257371-tempfile.foma'; my $foma_out_filename = 'b257371-tempfile.out'; my $sfst_compiler_program = 'fst-compiler'; my $sfst_print_program = 'fst-print'; my $sfst_script_filename = 'b257371-tempfile.sfst'; my $sfst_out_filename_fmt = 'b257371-tempfile-%s.out'; my $method; foreach my $arg (@ARGV) { if ($arg =~ /^[0-9]+$/) { # number $want_num_primes = $arg; } else { # method if (defined $method) { die "duplicate method choice: $method and $arg"; } $method = $arg; } } if (! defined $method) { $method = 'foma'; } print "Create $output_bfile_filename n=1 to n=$want_num_primes, by method $method ...\n"; my $start_time = time(); if ($method =~ /^hfst(-(.*))?/) { # hfst, possibly with choice of format, treated like foma my $format = ($2 ? " --format=$2" : ""); $method = 'foma'; $foma_program = "hfst-xfst$format -F"; } { open my $primes, '<', $primes_filename; END { close $primes; } # Return ($n, $prime) read from b000040.txt sub nextprime { while (defined(my $line = readline $primes)) { next if $line =~ /^\s*(#|\s*$)/; # comments and blanks $line =~ /^(\d+)\s+(\d+)/ or die "oops, unrecognised line from $primes_filename: \"$line\""; my $n = $1; my $prime = $2; return ($n, $prime); } return (); # end of file } } open my $bfile, '>', $output_bfile_filename; if ($method eq 'foma') { # Foma or HFST Method # ------------------- # # A big script file b257371-tempfile.foma is created. Foma and HFST take # a similar input syntax (Xerox style) and this script suits either. # # Bit strings for the primes are letters "a" and "b" instead of bits 0 and # 1 since 0 means the empty regex. The script maintains a current state # machine matching all up to prime(n), then unions in the next string # prime(n+1), and so on. # # The state counts in A257371 are for a "complete" DFA, meaning every # state has a destination for both bits 0 and 1. This includes a # "non-accepting sink" state for anything not matched by combinations of # the relevant primes. Foma and HFST minimization omit such a sink. # Every "(primes)*" regexp has infinite non-accepting strings # (eg. 000...), so state count +1 for a sink. Foma has a "complete net" # to add an actual such state, but that doesn't work in HFST it seems, and # it's a touch faster without. # # The output from foma or hfst-xfst is parsed to get its state counts. # An "echo" from the script indicates what "n" we're up-to in the output. # This parse will be highly dependent on the respective program output # format. The code was created for Foma version 0.9.18 and HFST 3.15.1. # # Foma is quite fast so a run for 1000 primes should take only a few # seconds even on a nowadays modest CPU. A run to 10000 needs only a # little patience (watch b257371-tempfile.out for progress). Bigger with # a bigger input file of primes should be feasible. { open my $foma_script, '>', $foma_script_filename; my @bit_to_letter = ('a','b'); # letters a,b for bits 0,1 while (my ($n,$prime) = nextprime()) { last if $n > $want_num_primes; my $re = sprintf '%b', $prime; # in binary $re =~ s/./ $bit_to_letter[$&]/g; # as letters a,b and space between print $foma_script "regex $re;\n"; if ($n > 1) { print $foma_script "union net\n"; # newprime | prevprimes } print $foma_script "zero-plus net\n"; # (newprime | prevprimes)* print $foma_script "echo Prime $n, decimal $prime = string $re\n"; print $foma_script "echo\n"; print $foma_script "\n"; last if $n >= $want_num_primes; } close $foma_script; } { my $command = "$foma_program $foma_script_filename >$foma_out_filename"; print "Run: $command\n"; system $command; } { my $num_states = 0; open my $from_foma, '<', $foma_out_filename; while (defined(my $line = readline $from_foma)) { if ($line =~ /(\d+) states/) { # foma output line like: # 1.2 kB. 37 states, 68 arcs, Cyclic. # hfst output line like # ? bytes. 37 states, 68 arcs, ? paths $num_states = $1 + 1; # +1 to count a non-accepting sink } elsif ($line =~ /Prime (\d+)/) { # echo in the script, after each completed state machine my $n = $1; defined $num_states or die "oops, unrecognised foma output"; print $bfile "$n $num_states\n"; undef $num_states; } } close $from_foma; } } elsif ($method eq 'sfst') { # SFST Method # ----------- # # A big sfst compiler script b257371-tempfile.sfst is created ready for # fst-compiler. The script maintains a current state machine $P$ matching # all all combinations up to prime(n), then unions in the next string # prime(n+1), and so on. Each state machine is written to a file like # b257371-tempfile-123.out. This is a binary format. Its contents are # examined by running fst-print and parsing the output. # # The state counts in A257371 are a "complete" DFA, meaning every state # has a destination for both bits 0 and 1. This includes a "non-accepting # sink" state for anything not matched by combinations of the relevant # primes. SFST's minimization omits a non-accepting sink. The parse of # fst-print's output here watches for any omitted transitions, which means # an omitted sink, and adds +1 if any such. There's always such an # omitted sink in the state machines here (eg. strings 000...), but # watching the transitions is a generic way. # # The fst-print output is "AT&T" text format. The parse here is quite # tight so as not to silently give the wrong result if something # unrecognised in the output might be relevant. The code here was created # for sfst version 1.4.7b. # # SFST is quite fast, but writing a full state machine file and then # reading it back with an fst-print run introduces a lot of overhead. # A run to 1000 primes is fast even on a nowadays modest CPU. A run for # 10000 primes is feasible but will use multi-megabytes of temporary # files, and seems to peak at a lot of memory use too. { open my $sfst_script, '>', $sfst_script_filename; while (my ($n,$prime) = nextprime()) { last if $n > $want_num_primes; my $binary = sprintf '%b', $prime; # in binary if ($n == 1) { print $sfst_script "\$P\$ = ($binary)*\n"; } else { print $sfst_script "\$P\$ = (\$P\$ | $binary)*\n"; } my $sfst_out_filename = sprintf $sfst_out_filename_fmt, $n; printf $sfst_script "\$P\$ >> \"$sfst_out_filename\"\n"; } print $sfst_script "0\n"; # dummy final compiler output close $sfst_script; } { my $sfst_dummy_filename = sprintf $sfst_out_filename_fmt, 'dummy'; my $command = "$sfst_compiler_program $sfst_script_filename $sfst_dummy_filename"; print "Run: $command\n"; system $command; } { open my $bfile, '>', $output_bfile_filename; my $total_size = 0; foreach my $n (1 .. $want_num_primes) { my $sfst_out_filename = sprintf $sfst_out_filename_fmt, $n; my $command = "$sfst_print_program $sfst_out_filename"; if ($n==1) { print "Run: $command (etc)\n"; } my $str = `$command`; $? == 0 or die "oops, error exit from $sfst_print_program"; my $max_state = -1; my $count_transitions = 0; foreach my $line (split /\n/, $str) { if (my ($from,$to,$symbol) = ($line =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)$/)) { # a transition $max_state = max($max_state, $from, $to); } elsif ($line =~ /^\d+$/) { # an accepting state } else { die "oops unrecognised $sfst_print_program output: ",$line; } } my $num_states = $max_state + 1; if ($count_transitions != 2*$num_states) { # full is 2 transitions per state, anything less means omitted sink $num_states++; } print $bfile "$n $num_states\n"; $total_size += -s $sfst_out_filename; unlink $sfst_out_filename; } print "Total size of sfst state machine temporary files was $total_size\n"; } } elsif ($method eq 'flat') { # FLAT Method # ----------- # # Perl module FLAT.pm is pure Perl code, so needs no external programs, # but, circa its version 0.9, its minimization is not fast enough for more # than a very small b-file. # # FLAT minimization retains a non-accepting sink for anything not matched, # so its $fa->num_states is the desired a(n). require FLAT; my $fa; while (my ($n,$prime) = nextprime()) { last if $n > $want_num_primes; my $binary = sprintf '%b', $prime; # in binary my $t = FLAT::Regex->new($binary); if (! defined $fa) { $fa = $t->star; # initial newprime* } else { $fa = $fa->as_nfa; # Nasty hack for FLAT version 0.9.4 where its $fa->as_nfa() on a DFA # leaves the object still blessed into FLAT::DFA, causing infinite # recursion in the union() method. Don't like to muck about with its # internals, but can at least presume that after the problem is fixed # as_nfa() will not be isa FLAT::DFA::Minimal. if ($fa->isa('FLAT::DFA::Minimal')) { bless $fa, 'FLAT::NFA'; } $fa = $fa->union($fa,$t)->star; # (newprime | prevprimes)* } $fa = $fa->as_min_dfa; my $num_states = $fa->num_states; print $bfile "$n $num_states\n"; } } else { die "unrecognised method: $method"; } close $bfile; my $elapsed_time = time() - $start_time; print "Elapsed time $elapsed_time seconds, $output_bfile_filename size ", -s $output_bfile_filename," bytes\n"; exit 0