/* Kevin Ryde, September 2024

   Usage: ./a.out [-v] [start|resume]

   This is some C code using GNU C features and for a 64 bit build
   to find terms of

       A372707(n) = smallest starting x requiring n steps
         to reach 0 under the map x -> step(x)
         where step(x) = 2^x mod x = A015910(x).

   The method is an explicit search through starting x, with the
   trajectory of x stopped early if a step drops by so much that
   previous terms show x is not n steps.


   Command Line
   ------------

   Output is b-file style so for instance a(12)=34305,

       12 34305

   Optional parameter "-v" or doubled -v -v prints some more verbose
   information about progress.


   Save and Resume
   ---------------

   The current n, x, and previous terms, are saved to

       saved-state.txt

   at 5 minute intervals and on kill (SIGINT, SIGTERM, SIGHUP).
   The search can be resumed by

       ./a.out resume

   This is intended for a long run which might have to be restarted.
   saved-state.txt is like

        n = 4
        x = 26
        0 0
        1 1
        2 3
        3 18

   n is the next target n steps sought.
   x is the next candidate x to consider.
   b-file style previous terms follow for i = 0..n-1.
   Progress information then follows and is ignored on resume.


   Compile Time
   ------------

   The default #define USE_REDC 1 selects REDC modular multiplies,
   rather than ordinary divisions.  REDC trades 1 div for 2 mul and
   certain pre and post calculations and is usually faster on
   current CPUs.

   umul_ppmm() and udiv_qrnnd() are "longlong.h" style asm macros.
   Macros for x86_64 are copied here.  Grab them from longlong.h
   for other CPUs.

   If no suitable asm, then GNU C type __int128 is used as a fallback.
   This is likely to be slower than asm, in particular division is
   probably a libgcc call and allows for quotient bigger than 64 bits
   which doesn't happen here.

   A 32 bit build is not targeted since the known terms already
   exceed 32 bits, and really need the speed of 64 bit operations.


   REDC Speed
   ----------

   The comparison for USE_REDC or not is roughly

       uint64_2pow_mod_odd_by_redc() =
           6 mul + 2 div pre and post REDC setups
             (possibly 1 of those div simultaneous with 4 muls)
           3 mul per bit of (odd part of exponent e) >> 6
             (ie. 6 bits shifted off)

       uint64_2pow_mod_by_div()
           1 mul + 1 div per bit of exponent e >> (6 or 7)
             (ie. 6 or 7 bits shifted off)

   When REDC is faster, there'll be a break-even point at some bit
   length of exponent e where the cost of the pre and post setups
   is overcome.  On an Intel with 14 cycle mul and 70 cycle div,
   this was as few as 10 bits long (which is almost instantly passed).

   In saved-state.txt, the steps per second statistic shows the
   overall rate achieved.  Can experiment, at desired size x,
   to see the rate with USE_REDC 1 versus 0.


   Trajectory Stop Condition
   -------------------------

   In is_n_steps(), a trajectory which cannot reach n steps is
   detected early by

       y = step(step(...step(x)))  after i steps applied
       if y < a(n-i) then x cannot be n steps

   The simplest stop is on the first step i=1

       if y = step(x) < a(n-1)  then x cannot be n steps

   This is y immediately dropping below the previous a(n-1).
   n-1 more steps are desired, but y < a(n-1) means at most n-2.

   A stop can occur after any number of steps.  Successive steps
   might remain good for a while, but then drop a large amount.

   Progress information in saved-state.txt includes average number
   of steps for the last chunk of searching.  Assuming a random
   0 <= step(x) < x, there's more chance of staying high when x is
   much larger than a(n-1).  So average steps tend to increase with
   x, for a given n.  The time for each step grows with the bit
   length of x.


   Things Not Done
   ---------------

   The alternative to trajectory stopping would be to remember
   num_steps(y) for every y < x so num_steps(x) = num_steps(y) + 1
   where y = step(x).  But up near x = 2^40 this is a trillion
   previous elements with random access.  In enough memory, that
   could reduce 1.5 to 2.5 steps per x to instead just 1 step.

   When x is odd, step(2*x) is a single extra modular square,

       step(2*x) = s or s+x, whichever is even
          where s = step(x)^2 mod x

   This is in the same modulus x as step(x) already uses,
   so is a cheap first step for the trajectory of 2*x.
   But would an early explore of 2*x help?  Or other 2^k*x ?
   The trajectory stopping rule only acts on descents below a(n-1),
   so is much less effective on bigger values.
*/


/* Set to 1 to use REDC for modular multiply.
   Set to 0 to use ordinary division.  */
#define USE_REDC  1

/* Set WANT_ASSERT to 1 to enable self-checks (slower).
   Set WANT_DEBUG to 1 for some rough development prints. */
#define WANT_ASSERT    0
#define WANT_DEBUG     0

#if ! WANT_ASSERT
#define NDEBUG
#endif

#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <signal.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdnoreturn.h>
#include <time.h>
#include <sys/time.h>


/* ------------------------------------------------------------------------ */
/* Generic */

#if WANT_DEBUG
#define DEBUG(expr)  do { expr; } while (0)
#else
#define DEBUG(expr)  do {       } while (0)
#endif

#ifdef __GNUC__
#define LIKELY(cond)    __builtin_expect((cond) != 0, 1)
#define UNLIKELY(cond)  __builtin_expect((cond) != 0, 0)
#define ATTRIBUTE_PRINTF __attribute__ ((format (printf,1,2)))
#define ATTRIBUTE_UNUSED __attribute__ ((unused))
#else
#define LIKELY(cond)    (cond)
#define UNLIKELY(cond)  (cond)
#define ATTRIBUTE_PRINTF
#define ATTRIBUTE_UNUSED
#endif

#define numberof(array)  (sizeof(array)/sizeof((array)[0]))

/* GNU C type __int128 available on a 64 bit target.  */
#if defined(__SIZEOF_INT128__)
#define HAVE_UINT128  1
typedef unsigned __int128  uint128_t;
#endif

noreturn ATTRIBUTE_PRINTF void
error (const char *format, ...)
{
  va_list ap;
  va_start (ap, format);
  vfprintf (stderr, format, ap);
  va_end (ap);
  exit(1);
}

/* with ptr=NULL allowed for initial malloc() too */
static void *
realloc_or_die(void *ptr, size_t size)
{
  ptr = (ptr==NULL ? malloc(size) : realloc(ptr, size));
  if (UNLIKELY (ptr == NULL))
    error("Cannot realloc() to %zu bytes\n", size);
  return (ptr);
}

static FILE *
fopen_or_die (const char *filename, const char *mode)
{
  FILE *fp = fopen(filename,mode);
  if (fp==NULL)
    error("Cannot %s %s\n",
          mode[0] == 'r' ? "open" : "create", filename);
  return (fp);
}

static void
ferror_die (FILE *fp, const char *filename, const char *action)
{
  if (ferror(fp))
    error("Error %s %s: %s\n",
          action, filename, strerror(errno));
}

static void
fclose_or_die (FILE *fp, const char *filename)
{
  if (fclose(fp) != 0)
    error("Error closing %s\n", filename);
}

static void
rename_or_die (const char *old_filename, const char *new_filename)
{
  if (rename(old_filename, new_filename) != 0)
    error("Error renaming %s to %s: %s\n",
          old_filename, new_filename, strerror(errno));
}

static void
sigaction_or_die (int signum, const struct sigaction *act, struct sigaction *oldact)
{
  if (sigaction (signum, act, oldact) != 0)
    error("Cannot sigaction signum=%d: %s\n",
          signum, strerror(errno));
}

static void
setitimer_or_die (int which, const struct itimerval *new_value,
                  struct itimerval *old_value)
{
  if (setitimer(which, new_value, old_value) != 0)
    error("Cannot setitimer %d: %s\n",
          which, strerror(errno));
}

/* CPU time consumed by this process so far, in seconds */
static double
cputime (void)
{
  struct timespec t;
  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t) != 0)
    error("Cannot clock_gettime() CPUTIME: %s\n",
          strerror(errno));
  return t.tv_sec + t.tv_nsec/1e9;
}


/* ------------------------------------------------------------------------ */
/* longlong.h macros

   umul_ppmm(w1,w0, u, v) sets words (w1,w0) = u*v,
   with w1 being the most significant word.

   udiv_qrnnd(q,r, n1,n0, dv) sets q,r to quotient and remainder of
   division (n1,n0) / dv where n1,n0 are two words of dividend and
   dv is one word divisor.  Rounding is downward so 0 <= r < dv.
   Must have n1 < dv so q fits in one word (even if q unused).
   On x86 CPUs, if n1 >= dv then expect SIGFPE like divide by zero.
*/

#if defined(__GNUC__) && defined(__x86_64__)
#define HAVE_LONGLONG_MACROS 1
#define LONGLONG_MACROS_TYPE  "x86_64"
/* these as per longlong.h */
typedef uint64_t UDItype;
#define umul_ppmm(w1,w0, u, v)          \
  __asm__ ("mul{q} %3"                  \
           : "=a" ((UDItype) (w0)),     \
             "=d" ((UDItype) (w1))      \
           : "%0" ((UDItype) (u)),      \
             "rm" ((UDItype) (v)))
#define udiv_qrnnd(q,r, n1,n0, dv)      \
  do {                                  \
    assert ((n1) < (dv));               \
    __asm__ ("div{q} %4"                \
             : "=a" ((UDItype) (q)),    \
               "=d" ((UDItype) (r))     \
             : "0" ((UDItype) (n0)),    \
               "1" ((UDItype) (n1)),    \
               "rm" ((UDItype) (dv)));  \
  } while (0)
#endif

/* Fallback on uint128_t if no asm macros. */
#if ! HAVE_LONGLONG_MACROS
#define HAVE_LONGLONG_MACROS  1
#define LONGLONG_MACROS_TYPE  "uint128_t"
#define umul_ppmm(w1,w0, u, v)                          \
  do {                                                  \
    uint128_t  umul_ppmm__w = (uint128_t)(u) * (v);     \
    (w1) = (uint64_t) (umul_ppmm__w >> 64);             \
    (w0) = (uint64_t) umul_ppmm__w;                     \
  } while (0)
#define udiv_qrnnd(q,r, n1,n0, dv)                      \
  do {                                                  \
    assert ((n1) < (dv));                               \
    uint128_t  udiv_qrnnd__n =                          \
      (((uint128_t)(n1)) << 64) + (n0);                 \
    uint64_t   udiv_qrnnd__dv = (dv);                   \
    (q) = (uint64_t) (udiv_qrnnd__n / udiv_qrnnd__dv);  \
    (r) = (uint64_t) (udiv_qrnnd__n % udiv_qrnnd__dv);  \
  } while (0)
#endif


/* ------------------------------------------------------------------------ */
/* uint64_t arithmetic */

#define UINT64_HIGHBIT  ((uint64_t)1 << 63)

static inline int
uint64_count_trailing_zeros (uint64_t x)
{
  assert (x != 0);
#ifdef __GNUC__
  if (sizeof(unsigned) >= sizeof(x))
    return (__builtin_ctz(x));
  if (sizeof(unsigned long) >= sizeof(x))
    return (__builtin_ctzl(x));
  if (sizeof(unsigned long long) >= sizeof(x))
    return (__builtin_ctzll(x));
#endif
  int ret = 0;
  while (! (x & 1)) { ret++; x>>=1; }
  return (ret);
}

static inline int
uint64_count_leading_zeros (uint64_t x)
{
  assert (x != 0);
#ifdef __GNUC__
  if (sizeof(unsigned) >= sizeof(x))
    return (__builtin_clz(x));
  if (sizeof(unsigned long) >= sizeof(x))
    return (__builtin_clzl(x));
  if (sizeof(unsigned long long) >= sizeof(x))
    return (__builtin_clzll(x));
#endif
  uint64_t mask = UINT64_HIGHBIT;
  int ret = 0;
  while (! (x & mask)) { mask>>=1; ret++; }
  return (ret);
}

static int
uint64_bit_length (uint64_t x)
{
  return (64 - uint64_count_leading_zeros(x));
}

/* uint64_mul_mod() returns x*y mod M.
   x and y must be small enough that the quotient fits a uint64_t,
   ie. x*y < M*2^64 so q = floor((x*y)/M) <= UINT64_MAX.  */
static inline uint64_t
uint64_mul_mod (uint64_t x, uint64_t y, uint64_t M)
{
  assert (M != 0);
  uint64_t  hi,lo, r;
  uint64_t  q  ATTRIBUTE_UNUSED;
  umul_ppmm (hi,lo, x,y);
  udiv_qrnnd (q,r, hi,lo, M);
  return (r);
}

/* uint64_2pow_mod_by_div(e,M) returns 2^e mod M.
   Must have e >=1 and M fits 63 bits, ie. M < 2^63. */
static inline uint64_t
uint64_2pow_mod_by_div (uint64_t e, uint64_t M)
{
  /* Double and step is done with the step (when required) combined
     into the double multiply.  This is at most ret*(2*ret) with
     ret = M-1, so at most 2*M^2 - 2*M + 1.  Since M < 2^63, this
     is < M*2^64 as required by uint64_mul_mod().
  */
  assert (e != 0);  /* not used with e=0 here */
  assert (M != 0 && M < UINT64_HIGHBIT);
  int c = uint64_count_leading_zeros(e);
  int e_len = 64 - c;  /* bit length of e */
  if (UNLIKELY (e_len <= 6)) {
    assert (1 <= e && e <= 63);
    return (((uint64_t)1 << e) % M);
  }

  e <<= c;             /* highest 1 bit now at position 63 */
  uint64_t ret;
  {
    /* Highest 6 or 7 bits of e as "initial_e"
       ret = 2^(initial_e) mod M
       Must have 2^(initial_e) < M*2^64 for udiv_qrnnd() doing mod M.
       7 bits is 64 <= initial_e <= 127 and "high" is the prospective
       high to udiv.  If high >= M then too big and must use 6 bits.
       6 bits is 32 <= initial_e <= 63 so fits in "low" alone.
       The chance of 7 bits depends on the size of M but is an easy
       check to save 1 mul+mod when possible.
    */
    int e_initial_len = 7;
    uint64_t e_initial = e >> (64 - e_initial_len);   /* high bits */
    assert (64 <= e_initial && e_initial <= 127);
    uint64_t high = (uint64_t)1 << (e_initial - 64);;
    uint64_t low = 0;
    if (high >= M) {
      e_initial >>= 1;   /* high 6 bits */
      assert (32 <= e_initial && e_initial <= 63);
      e_initial_len = 6;
      high = 0;
      low = (uint64_t)1 << e_initial;
    }
    uint64_t  q  ATTRIBUTE_UNUSED;
    udiv_qrnnd (q,ret, high,low, M);
    e <<= e_initial_len;
    e_len -= e_initial_len;
  }

  /* If 0 bit in e then double to ret^2 mod M.
     If 1 bit in e then double and step to 2*ret^2 mod M.
     e is shifted up so the bit tested is at position 63.  */
  while (--e_len >= 0) {
    ret = uint64_mul_mod (ret, ret << (e>>63), M);
    e <<= 1;   /* next lower bit of e to test */
  }
  return (ret);
}


/* ---------------------------------------------------------------------- */
/* Montgomery's REDC represents a remainder x mod M as an "xR",

       xR = x*R mod M    where R = 2^64
       (with M odd so M and R have no common factor)

   The advantage of this is that modular multiply x*y mod M
   can have its mod done by 2 multiplies instead of 1 divide.
   On many CPUs multiply is much faster than divide so a net speedup.

   Here REDC is used in uint64_t machine words x, xR, M, etc.
   step_by_redc() ensures M is odd.

   Modular multiplication is

       z = x*y mod M

   In xR = x*R, yR = y*R and desiring zR = z*R, all mod M, this is

       T = xR*yR = (Th,Tl) = Th*R + Tl     1x1 -> 2 word multiply
       zR = REDC(T)                        then reduction
   where
       REDC(T) = T * R^(-1) mod M

   T gets two factors of R, hence reduction by * R^(-1) mod M.
   This reduction of T is 2 further multiplies.
   The idea is to subtract from T some multiple of M which will
   reach a multiple of R.  This multiple is m*M where

       m = Tl*Minv mod R
         with pre-calculated  Minv = M^(-1) mod R

       t = (T - m*M) / R
         = Th - high(m*M)
       zR = / t       if t >= 0
            \ t + M   if t < 0

    By construction, low(m*M) = Tl so only high(m*M) is needed
    and subtract that from Th.

    In uint64_2pow_mod_odd_by_redc(), pre-calculation is 4 multiplies
    for redc_init_Minv(), and 2 divisions for initial "retR".
    Post-calculation is 2 multiplies (one REDC).

    Initial retR is the most significant 6 bits of exponent e,
    as h = 2^(e_high_6) mod M done by 1 division, then
    retR = redc_in_single(h) by 1 further division so that

        retR = (2^e_high) * R  mod M
             = 2^(e_high + 64) mod M

    The first remainder h mod M can be simultaneous with the Minv
    pre-calculation if the CPU allows.
*/

/* x,y are remainders mod M, with 0 <= x < M and 0 <= y <= M.
   Return difference x-y mod M.
   Must have M < 2^63 (so can use the sign bit in x-y).  */
static inline uint64_t
uint64_submod (uint64_t x, uint64_t y, uint64_t M)
{
  assert (x < M);
  assert (y <= M);
  assert (1 <= M && M < UINT64_HIGHBIT);
  x -= y;
  return (x & UINT64_HIGHBIT ? x+M : x);
}

/* inv = mod16bits_inverse_table[n] is the multiplicative inverse
   of 2*n+1 mod 2^16, meaning inv*(2*n+1) == 1 mod 2^16.  */
static uint16_t mod16bits_inverse_table[32768];

static void
init_mod16bits_inverse_table (void)
{
  size_t n,j;
  for (n=0; n < numberof(mod16bits_inverse_table); n++) {
    uint16_t M = 2*n+1;
    uint16_t inv = 1;   /* initially 1 bit good */
    for (j=1; j <= 4; j++)
      inv *= 2 - inv*M;  /* double 4 times so 2^4 = 16 bits good */
    if ((uint16_t) (inv*(2*n+1)) != 1)
      error("oops, init_mod16bits_inverse_table() not inverse\n");
    mod16bits_inverse_table[n] = inv;
  }
}

/* uint64_modword_inverse(M) returns the multiplicative modular
   inverse of M mod 2^64, meaning inv for which M*inv == 1 mod 2^64.
   M must be odd, since otherwise no such inverse exists.  */
static inline uint64_t
uint64_modword_inverse (uint64_t M)
{
  assert ((M & 1) == 1);
  uint64_t inv = mod16bits_inverse_table[(M>>1) & 0x7FFF]; /* 16 bits */
  inv *= 2 - inv*M;    /* 32 bits */
  inv *= 2 - inv*M;    /* 64 bits */
  assert (inv*M == 1);
  return (inv);
}

/* Return Minv = M^(-1) mod R for use in REDC mod M. */
static inline uint64_t
redc_init_Minv (uint64_t M)
{
  return (uint64_modword_inverse (M));
}

/* For the given x, return REDC form xR = x*R mod M.
   Must have 0 <= x < M.
   This is xR = x*2^64 mod M.
   This "in" conversion is intended for a single x to xR.
   (As opposed to several inputs x of the same M.)  */
static inline uint64_t
redc_in_single (uint64_t x, uint64_t M)
{
  assert (0 <= x && x < M);
  uint64_t  q  ATTRIBUTE_UNUSED;
  uint64_t  r;
  udiv_qrnnd (q,r, x,(uint64_t)0, M);
  return (r);
}

/* Th,Tl are two words of a multiply (Th,TL) = xR*yR.
   Reduce by one factor of R, so zR = xR*yR*R^(-1) mod M = x*y*R mod M.
   This zR is the REDC form of x*y mod M.
   Must have Th < M so the "t" described in the comments above is
   in the range -M < t < M and so uint64_submod() suffices here.  */
static inline uint64_t
redc_reduce (uint64_t Th, uint64_t Tl, uint64_t M, uint64_t Minv)
{
  assert (Th < M);
  assert (M*Minv == 1);

  uint64_t  hi, lo;
  umul_ppmm (hi,lo, Tl*Minv, M);
  assert (lo == Tl);
  uint64_t t = uint64_submod (Th, hi, M);
  assert (0 <= t && t < M);
  return (t);
}

/* For xR,yR REDC forms of x,y, return zR REDC of z = x*y mod M.
   Can have extra constant factors in xR or yR, making them bigger
   than normal residue range 0..M-1, so long as xR*yR < M*2^64.  */
static inline uint64_t
redc_mul (uint64_t xR, uint64_t yR, uint64_t M, uint64_t Minv)
{
  uint64_t  Th,Tl;
  umul_ppmm (Th,Tl, xR, yR);
  return (redc_reduce (Th,Tl, M, Minv));
}

/* For xR REDC form of x, return that x.  */
static inline uint64_t
redc_out (uint64_t xR, uint64_t M, uint64_t Minv)
{
  return (redc_reduce (0,xR, M,Minv));
}

/* Return 2^e mod M, for e >=1.
   M must be odd and fit 63 bits, ie. M < 2^63. */
static uint64_t
uint64_2pow_mod_odd_by_redc (uint64_t e, uint64_t M)
{
  /* Double and step is done together by ret * (2*ret).
     This is within the xR*yR < M*2^64 demanded by redc_mul().
     At most ret = M-1 which is 2*ret^2 <= 2*M^2 - 2*M + 1.
     Then with M < 2^63 have 2*M^2 < M*2^64.

     The most significant 6 bits of e are initial 2^e_high_6 by "%".
     This "%" instead of udiv_qrnnd() lets the compiler see the
     operation so might do it simultaneously with Minv calculation.
     redc_in_single() is a further udiv_qrnnd() though.
  */

  DEBUG(printf("uint64_2pow_mod_odd_by_redc() e=%"PRIu64" M=%"PRIu64"\n", e,M));
  assert (e != 0);  /* not used with e=0 here */
  assert (M & 1);

  int e_clz = uint64_count_leading_zeros(e);
  int e_len = 64-6 - e_clz;  /* bit length of e, sans high 6 bits */
  if (UNLIKELY(e_len <= 0)) {
    assert (1 <= e && e <= 63);
    return (((uint64_t)1 << e) % M);  /* when e <= 6 bits */
  }

  /* Highest 6 bits of e taken together. */
  uint64_t e_high_6 = e >> e_len;   /* high 6 bits */
  e <<= 64 - e_len;                 /* those below now at top */
  assert (32 <= e_high_6 && e_high_6 <= 63);
  uint64_t initial_ret = (uint64_t)1 << e_high_6;
  initial_ret %= M;

  uint64_t Minv = redc_init_Minv (M);
  DEBUG(printf("  Minv = 0x%"PRIX64"\n", Minv));

  /* retR = initial_ret * 2^64 mod M */
  uint64_t retR = redc_in_single (initial_ret, M);

  /* If 0 bit in e then double to ret^2 mod M.
     If 1 bit in e then double and step to 2*ret^2 mod M .
     e is shifted up so the bit tested is at position 63.  */
  do {
    DEBUG(printf("  at e_len=%d e=0x%"PRIX64"\n", e_len, e));
    assert (e_len >= 1);
    retR = redc_mul (retR, retR << (e>>63), M,Minv);
    e <<= 1;   /* next lower bit of e */
  } while (LIKELY(--e_len != 0));

  uint64_t ret = redc_out (retR, M,Minv);
  DEBUG(printf("  final retR = 0x%"PRIX64" is ret=%"PRIu64"\n", retR, ret));
  return (ret);
}


/* ------------------------------------------------------------------------ */

static int option_verbose = 0;
static int option_maxn = INT_MAX;

/* Previously found a[i] = a(i), for 0 <= i < a_len. */
static uint64_t *a;
static size_t    a_len;  /* number of elements in a[] array */

static void
set_a (int n, uint64_t x)
{
  if (n != a_len)
    error("set_a() oops, next term not consecutive\n");
  a = realloc_or_die (a, (a_len+1)*sizeof(a[0]));
  a[a_len++] = x;
}


/* ------------------------------------------------------------------------ */
/* Save State */

static const char saved_state_filename[] = "saved-state.txt";
static const char temp_filename[] = "saved-state.txt.tempfile";
static uint64_t last_num_x = 0;
static uint64_t last_num_steps = 0;
static uint64_t last_cputime = 0;

void
save (int n, uint64_t x)
{
  /* Write first to temp_filename and then rename,
     so atomic replacement.
  */
  FILE *fp = fopen_or_die (temp_filename, "w");
  fprintf(fp, "n = %d\n", n);
  fprintf(fp, "x = %"PRIu64"\n", x);
  int i;
  for (i = 0; i < a_len; i++)
    fprintf(fp, "%d %"PRIu64"\n", i, a[i]);

  /* extra info */

  fprintf(fp, "\n-------------\n");
  fprintf(fp, "x is %d bits\n", uint64_bit_length(x));

  double t = cputime();
  double elapsed_time = t - last_cputime;
  fprintf(fp, "last chunk %.1f seconds CPU time\n", elapsed_time);

  fprintf(fp, "tried %"PRIu64" x by %"PRIu64" steps, average %.2lf steps\n",
          last_num_x, last_num_steps,
          (last_num_x > 0 ? (double) last_num_steps / last_num_x : -1.0));

  double x_per_second = (elapsed_time >= 1
                         ? last_num_x / elapsed_time : -1.0);
  double steps_per_second = (elapsed_time >= 1
                             ? last_num_steps / elapsed_time : -1.0);
  fprintf(fp, "%.2lf million x/second, %.2lf million steps/second\n",
          x_per_second/1e6,
          steps_per_second/1e6);

  ferror_die (fp, temp_filename, "writing");
  fclose_or_die (fp, temp_filename);
  rename_or_die (temp_filename, saved_state_filename);

  if (option_verbose >= 2)
    printf("save to %s  n=%d x=%"PRIu64"\n",
           saved_state_filename, n, x);

  last_cputime = cputime();
  last_num_x = 0;
  last_num_steps = 0;
}

static volatile sig_atomic_t flag_interrupt = 0;
static volatile sig_atomic_t flag_terminate = 0;

void
sig_handler_save_state (int signum)
{
  if (option_verbose >= 2)
    printf("signal %d received for save state\n", signum);
  flag_interrupt = 1;
}
void
sig_handler_terminate (int signum)
{
  if (option_verbose)
    printf("signal %d received for terminate\n", signum);
  flag_interrupt = 1;
  flag_terminate = 1;
}

/* Initialise handlers for saving on SIGINT, SIGHUP, SIGTERM,
   and on a timer.
*/
void
init_save ()
{
  struct sigaction s;
  s.sa_handler = sig_handler_terminate;
  sigfillset(&s.sa_mask);
  s.sa_flags = SA_RESTART;
  sigaction_or_die(SIGINT,  &s, NULL);
  sigaction_or_die(SIGHUP,  &s, NULL);
  sigaction_or_die(SIGTERM, &s, NULL);

  s.sa_handler = sig_handler_save_state;
  sigaction_or_die(SIGALRM, &s, NULL);

  struct itimerval t;
  t.it_interval.tv_sec = 5*60;  /* 5 minutes */
  t.it_interval.tv_usec = 0;
  t.it_value = t.it_interval;
  setitimer_or_die(ITIMER_REAL, &t, NULL);

  last_cputime = cputime();
}


/* ------------------------------------------------------------------------ */
/* Sample Data */

static const uint64_t A372707_data[] = {
  0, 1, 3,
  18, 35, 207, 774, 1513, 2051, 8900, 13459, 25907, 34305,
  88036, 153839, 382283, 397590, 636459, 844177, 1456073, 2426735,
  7312307, 11716125,
  14657311,
  43165242,
  52706549,
  84955821,    /* n=26 */
  188736643,   /* n=27 */
  416569989,
  953350161,
  1617152961,  /* n=30 */
  2541237149,  /* n=31 */
  4847485412,  /* n=32 */

  /* bfile */
  9171340873,     /* a(33) */
  15795226813,    /* a(34) */
  20020973943,    /* a(35) */
  39869603498,    /* a(36) */
  74028396502,    /* a(37) */
  79091378014,    /* a(38) */
  80545682190,    /* a(39) */
  168185721297,   /* a(40) */
  971825121269,   /* a(41) */
  1622801529123,  /* a(42) */
  2121667242973,  /* a(43) */
  2217254553455,  /* a(44) */
  2417405690903   /* a(45) */
};

static void
check_A372707 (int n, uint64_t x, const char *where)
{
  if (n < 0) error("oops, check_A372707() is for n>=0\n");
  if (n >= numberof(A372707_data)) {
    if (option_verbose) printf("  (beyond sample data)\n");
    return;
  }
  if (x == A372707_data[n]) {
    if (option_verbose)
      printf("  (%sgood vs sample data)\n", where);
    return;
  }
  error("oops %sn=%d, got %"PRIu64" want %"PRIu64"\n",
        where, n, x, A372707_data[n]);
}


/* ------------------------------------------------------------------------ */

static inline uint64_t
step_by_div (uint64_t x)
{
  assert (x != 0);
  return (uint64_2pow_mod_by_div (x,x));
}

static inline uint64_t
step_by_redc (uint64_t x)
{
  /* REDC is for odd modulus.
     If x even then shift it down to Xodd = x >> shift.
     Then calculate in exponent Eodd = x - shift;
     Resulting power is Yodd == 2^Eodd mod Xodd.
     y = Yodd * 2^shift is then y == 2^x mod x.
  */
  assert (x != 0);
  int shift = uint64_count_trailing_zeros(x);
  return (uint64_2pow_mod_odd_by_redc (x - shift, x >> shift)
          << shift);
}

static inline uint64_t
step (uint64_t x)
{
#if USE_REDC
  return (step_by_redc(x));
#else
  return (step_by_div(x));
#endif
}

/* Number of steps x -> step(x) needed for x to reach 0. */
static int
num_steps_of_x (uint64_t x)
{
  int n;
  for (n=0; x != 0; n++)
    x = step(x);
  return (n);
}

/* Check that each A372707_data[n] does reach 1 after n steps. */
static void
check_A372707_data_vs_num_steps (void)
{
  int n;
  for (n=1; n < numberof(A372707_data); n++) {
    uint64_t x = A372707_data[n];
    int got_n = num_steps_of_x(x);
    if (n != got_n)
      error("oops, A372707_data[] n=%d is x=%"PRIu64" but only takes %d steps\n", n, x, got_n);
  }
  if (option_verbose)
    printf("A372707_data[] good vs num_steps_of_x()\n");
}

/* Return 1 if x requires n steps to reach 0. */
static inline int
is_n_steps (int n, uint64_t x)
{
  DEBUG(printf("is_n_steps() n=%d x=%"PRIu64" within a_len = %zu\n",
               n, x, a_len));
  assert (n>=0);
  assert (n == a_len);
  last_num_x++;

  for ( ; LIKELY(n > 0); n--) {
    /* want n more steps */
    DEBUG(printf("  at n=%d x=%"PRIu64"\n", n, x));
    if (UNLIKELY(x==0)) {
      DEBUG(printf("  no, reached 0 before n steps\n"));
      return (0);
    }

    x = step(x);
    last_num_steps++;

    assert (0 <= n-1 && n-1 < a_len);
    if (LIKELY (x < a[n-1])) {
      DEBUG(printf("  no, x=%"PRIu64" < a(n-1) = %"PRIu64"\n",
                   x, a[n-1]));
      return(0);
    }
  }
  DEBUG(printf("  yes\n"));
  return (1);
}

void
show_configuration (void)
{
  static int done = 0;
  if (option_verbose && ! done) {
    done = 1;
#ifdef __GNUC__
    printf("Have __GNUC__\n");
#else
    printf("Not __GNUC__\n");
#endif
    printf("assert()s %s\n",
           WANT_ASSERT ? "enabled" : "disabled");
    printf("LONGLONG_MACROS_TYPE = %s\n", LONGLONG_MACROS_TYPE);
    printf("USE_REDC  %d\n", USE_REDC);
  } else {
    assert (printf("assert()s enabled\n"));
  }
}

static void
output (int n, uint64_t x)
{
  printf("%d %"PRIu64"\n", n, x);
  check_A372707 (n,x, "");
  ferror_die (stdout, "stdout", "writing");
}

/* Look for a(n) and onwards, with first candidate x.
   Must have a_len = n, meaning preceding a[0..n-1] all known. */
void
search_from (int n, uint64_t x)
{
  /* step(x) is a strictly decreasing function of x so that a(n)
     is a strictly increasing function of n.
     So it's enough to seek a single target next n.  */

  if (option_verbose)
    printf("search from n=%d x=%"PRIu64"\n", n,x);
  if (n > option_maxn) return;
  if (a_len != n) error("oops, search_from() must have a[0..n-1]\n");

  for ( ; LIKELY( (x & UINT64_HIGHBIT)==0 ); x++) {
    if (UNLIKELY(flag_interrupt)) {
      flag_interrupt = 0;
      save (n, x);
      if (flag_terminate) return;
    }

    if (is_n_steps (n, x)) {
      output (n, x);
      if (n >= option_maxn) return;
      set_a (n, x);
      n++;
    }
  }
  error ("stop, x bigger than 63 bits\n");
}

void
resume (void)
{
  if (option_verbose)
    printf("resume from %s\n", saved_state_filename);

  FILE *fp = fopen_or_die (saved_state_filename, "r");
  int n;
  uint64_t x;
  if (fscanf(fp, "n = %d\nx = %"SCNu64"\n", &n, &x) != 2)
    error("%s format unrecognised\n", saved_state_filename);
  int i;
  uint64_t prev;
  while (fscanf(fp, "%d %"SCNu64"\n", &i, &prev) == 2) {
    if (option_verbose)
      printf("resume a(%d) = %"PRIu64"\n", i, prev);
    check_A372707 (i,prev, "resume data ");
    set_a (i,prev);
  }
  if (n != i+1)
    error("%s previous terms should end at n-1 = %d\n",
          saved_state_filename, n-1);

  ferror_die (fp, saved_state_filename, "reading");
  fclose_or_die (fp, saved_state_filename);

  search_from (n, x);
}

int
main (int argc, char *argv[])
{
  DEBUG (option_verbose = 1);
  setbuf(stdout,NULL);
  init_mod16bits_inverse_table();
  init_save();
  int option_resume = 0;
  int i, endpos;

  for (i = 1; i < argc; i++) {
    const char *arg = argv[i];
    if (strcmp(arg,"-v")==0) {
      option_verbose++;
    } else if (strcmp(arg,"start")==0) {
      option_resume = 0;
    } else if (strcmp(arg,"resume")==0) {
      option_resume = 1;
    } else if (sscanf(arg, "maxn=%d%n", &option_maxn, &endpos) == 1
               && endpos == strlen(arg)) {
    } else {
      error("Unrecognised command line option: %s\n", arg);
    }
  }

  show_configuration();
  check_A372707_data_vs_num_steps();
  if (option_resume)
    resume();
  else
    search_from (0,0);
  return(0);
}