Date: Sat, 12 Jun 2004 12:46:17 -0400
From: Russ Cox <russcox(AT)gmail.com>
Subject: Number of longest common substrings problem

The C program below computes the initial terms
of this sequence, for a two-character alphabet or an
arbitrary alphabet.  For a two-character alphabet:

len 1: 1 lcs of length 1 for a a
len 2: 2 lcs of length 1 for aa ab
len 3: 4 lcs of length 2 for aab abb
len 4: 10 lcs of length 2 for abba baab
len 5: 24 lcs of length 2 for abbba baaab
len 6: 46 lcs of length 3 for aabbba abaaab
len 7: 100 lcs of length 4 for aaaaabb aabbbbb
len 8: 225 lcs of length 4 for aaaaaabb aabbbbbb
len 9: 525 lcs of length 5 for aaaaaaabb aaabbbbbb
len 10: 1225 lcs of length 6 for aaaaaaabbb aaabbbbbbb
len 11: 3136 lcs of length 6 for aaaaaaaabbb aaabbbbbbbb
len 12: 7056 lcs of length 7 for aaaaaaaaabbb aaaabbbbbbbb

The search finds the alphabetically earliest pair of strings
that exhibit the maximum.  It's interesting that len 4, 5, 6 don't
follow the same string pattern as all the rest.

Over alphabets of arbitrary size, the results match the above
through len 6.  Beyond that, the search takes longer than
I am patient.

The sequence 1,2,4,10,24,46, ... is now A094837.

Russ

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXN 20
#define debug 1

typedef struct Match Match;
struct Match
{
       int len;
       int count;
};

/*
 * Compute number of lcs for a, b using dynamic programming.
 * The lcs for the substrings a[0:i] b[0:j] is one of
 *
 *      if a[i]==b[j], lcs(a[0:i-1], b[0:j-1])+a[i]
 *      lcs(a[0:i-1], b[0:j])
 *      lcs(a[0:i], b[0:j-1])
 *
 * We can compute the number of lcs by just adding up the
 * number from each place, except that if we take them from
 * lcs(a[0:i-1], b[0:j]) and lcs(a[0:i], b[0:j-1]) and those in turn
 * didn't actually add any new ones, then we'll double-count
 * the ones from lcs(a[0:i-1], b[0:j-1]), so we need to subtract
 * those out.
 */
int
numlcs(char *a, char *b, int n, int *plen)
{
       int i, j, d, t, len, count;
       Match match[MAXN][MAXN];

       for(d=0; d<2*n-1; d++){
               i = 0;
               j = d-i;
               if(j >= n){
                       j = n-1;
                       i = d-j;
               }
               for(; i<n && j>=0; i++, j--){
                       /* start with nothing */
                       len = 0;
                       count = 0;

                       /* try adding to a previous match */
                       if(a[i] == b[j]){
                               if(i>0 && j>0){
                                       len = match[i-1][j-1].len+1;
                                       count = match[i-1][j-1].count;
                               }else{
                                       len = 1;
                                       count = 1;
                               }
                       }

                       /* try matches not containing a[i] */
                       if(i>0){
                               if((t=match[i-1][j].len) > len){
                                       len = t;
                                       count = match[i-1][j].count;
                               }else if(t == len)
                                       count += match[i-1][j].count;
                       }

                       /* try matches not containing b[j] */
                       if(j>0){
                               if((t=match[i][j-1].len) > len){
                                       len = t;
                                       count = match[i][j-1].count;
                               }else if(t == len)
                                       count += match[i][j-1].count;
                       }

                       /* we may have double-counted matches not
containing either */
                       if(i>0 && j>0 && match[i-1][j-1].len==len)
                               count -= match[i-1][j-1].count;

                       match[i][j].len = len;
                       match[i][j].count = count;
               }
       }

       if(plen)
               *plen = match[n-1][n-1].len;
       return match[n-1][n-1].count;
}

/*
 * Enumerate all pairs of strings of length n over an alphabet of size nalpha
 * and run numlcs on them.  buf contains the concatenation of the two strings;
 * only nbuf of the characters have been filled in so far.  The nbuf characters
 * only use the first nused characters in the alphabet, so we need only
 * consider the first nused+1 alphabet characters as next possible.
 *
 * Accumulate the current max in bestcount, bestlen, besta, bestb.
 */
int bestcount;
int bestlen;
char besta[MAXN+1];
char bestb[MAXN+1];
char *alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

void
search(int nalpha, int nused, int n, char *buf, int nbuf)
{
       int count, len;
       char *p, *ep;

       if(nbuf == 2*n){
               count = numlcs(buf, buf+n, n, &len);
               if(count > bestcount){
                       bestcount = count;
                       bestlen = len;
                       memmove(besta, buf, n);
                       memmove(bestb, buf+n, n);
               }
       }else{
               ep = alpha+nalpha;
               if(nused < nalpha)
                       ep = alpha+nused+1;
               for(p=alpha; p<ep; p++){
                       buf[nbuf] = *p;
                       search(nalpha, (p<alpha+nused ? nused : nused+1),
                               n, buf, nbuf+1);
               }
       }
}

void
usage(void)
{
       fprintf(stderr, "usage: lcs count aabaa aaabb\n");
       fprintf(stderr, "  or   lcs search n\n");
       fprintf(stderr, "  or   lcs search2 n\n");
       exit(1);
}

int
main(int argc, char **argv)
{
       int i, n, len;
       char buf[2*MAXN];

       if(argc < 2)
               usage();

       if(strcmp(argv[1], "count") == 0){
               if(argc != 4)
                       usage();
               n = strlen(argv[2]);
               if(n != strlen(argv[3])){
                       fprintf(stderr, "string lengths must match\n");
                       exit(1);
               }
               n = numlcs(argv[2], argv[3], n, &len);
               printf("%d of length %d\n", n, len);
       }else if(strcmp(argv[1], "search") == 0){
               if(argc != 3)
                       usage();
               n = atoi(argv[2]);
               search(2*n, 0, n, buf, 0);
               printf("len %d: %d lcs of length %d for %s %s (all alphabets)\n",
                       n, bestcount, bestlen,besta, bestb);
       }else if(strcmp(argv[1], "search2") == 0){
               if(argc != 3)
                       usage();
               n = atoi(argv[2]);
               search(2, 0, n, buf, 0);
               printf("len %d: %d lcs of length %d for %s %s (just alphabet=ab)\n",
                       n, bestcount, bestlen, besta, bestb);
       }else
               usage();
       return 0;
}