Date: Sat, 12 Jun 2004 12:46:17 -0400 From: Russ Cox <russcox(AT)gmail.com> Subject: Number of longest common substrings problem The C program below computes the initial terms of this sequence, for a two-character alphabet or an arbitrary alphabet. For a two-character alphabet: len 1: 1 lcs of length 1 for a a len 2: 2 lcs of length 1 for aa ab len 3: 4 lcs of length 2 for aab abb len 4: 10 lcs of length 2 for abba baab len 5: 24 lcs of length 2 for abbba baaab len 6: 46 lcs of length 3 for aabbba abaaab len 7: 100 lcs of length 4 for aaaaabb aabbbbb len 8: 225 lcs of length 4 for aaaaaabb aabbbbbb len 9: 525 lcs of length 5 for aaaaaaabb aaabbbbbb len 10: 1225 lcs of length 6 for aaaaaaabbb aaabbbbbbb len 11: 3136 lcs of length 6 for aaaaaaaabbb aaabbbbbbbb len 12: 7056 lcs of length 7 for aaaaaaaaabbb aaaabbbbbbbb The search finds the alphabetically earliest pair of strings that exhibit the maximum. It's interesting that len 4, 5, 6 don't follow the same string pattern as all the rest. Over alphabets of arbitrary size, the results match the above through len 6. Beyond that, the search takes longer than I am patient. The sequence 1,2,4,10,24,46, ... is now A094837. Russ #include <stdio.h> #include <stdlib.h> #include <string.h> #define MAXN 20 #define debug 1 typedef struct Match Match; struct Match { int len; int count; }; /* * Compute number of lcs for a, b using dynamic programming. * The lcs for the substrings a[0:i] b[0:j] is one of * * if a[i]==b[j], lcs(a[0:i-1], b[0:j-1])+a[i] * lcs(a[0:i-1], b[0:j]) * lcs(a[0:i], b[0:j-1]) * * We can compute the number of lcs by just adding up the * number from each place, except that if we take them from * lcs(a[0:i-1], b[0:j]) and lcs(a[0:i], b[0:j-1]) and those in turn * didn't actually add any new ones, then we'll double-count * the ones from lcs(a[0:i-1], b[0:j-1]), so we need to subtract * those out. */ int numlcs(char *a, char *b, int n, int *plen) { int i, j, d, t, len, count; Match match[MAXN][MAXN]; for(d=0; d<2*n-1; d++){ i = 0; j = d-i; if(j >= n){ j = n-1; i = d-j; } for(; i<n && j>=0; i++, j--){ /* start with nothing */ len = 0; count = 0; /* try adding to a previous match */ if(a[i] == b[j]){ if(i>0 && j>0){ len = match[i-1][j-1].len+1; count = match[i-1][j-1].count; }else{ len = 1; count = 1; } } /* try matches not containing a[i] */ if(i>0){ if((t=match[i-1][j].len) > len){ len = t; count = match[i-1][j].count; }else if(t == len) count += match[i-1][j].count; } /* try matches not containing b[j] */ if(j>0){ if((t=match[i][j-1].len) > len){ len = t; count = match[i][j-1].count; }else if(t == len) count += match[i][j-1].count; } /* we may have double-counted matches not containing either */ if(i>0 && j>0 && match[i-1][j-1].len==len) count -= match[i-1][j-1].count; match[i][j].len = len; match[i][j].count = count; } } if(plen) *plen = match[n-1][n-1].len; return match[n-1][n-1].count; } /* * Enumerate all pairs of strings of length n over an alphabet of size nalpha * and run numlcs on them. buf contains the concatenation of the two strings; * only nbuf of the characters have been filled in so far. The nbuf characters * only use the first nused characters in the alphabet, so we need only * consider the first nused+1 alphabet characters as next possible. * * Accumulate the current max in bestcount, bestlen, besta, bestb. */ int bestcount; int bestlen; char besta[MAXN+1]; char bestb[MAXN+1]; char *alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; void search(int nalpha, int nused, int n, char *buf, int nbuf) { int count, len; char *p, *ep; if(nbuf == 2*n){ count = numlcs(buf, buf+n, n, &len); if(count > bestcount){ bestcount = count; bestlen = len; memmove(besta, buf, n); memmove(bestb, buf+n, n); } }else{ ep = alpha+nalpha; if(nused < nalpha) ep = alpha+nused+1; for(p=alpha; p<ep; p++){ buf[nbuf] = *p; search(nalpha, (p<alpha+nused ? nused : nused+1), n, buf, nbuf+1); } } } void usage(void) { fprintf(stderr, "usage: lcs count aabaa aaabb\n"); fprintf(stderr, " or lcs search n\n"); fprintf(stderr, " or lcs search2 n\n"); exit(1); } int main(int argc, char **argv) { int i, n, len; char buf[2*MAXN]; if(argc < 2) usage(); if(strcmp(argv[1], "count") == 0){ if(argc != 4) usage(); n = strlen(argv[2]); if(n != strlen(argv[3])){ fprintf(stderr, "string lengths must match\n"); exit(1); } n = numlcs(argv[2], argv[3], n, &len); printf("%d of length %d\n", n, len); }else if(strcmp(argv[1], "search") == 0){ if(argc != 3) usage(); n = atoi(argv[2]); search(2*n, 0, n, buf, 0); printf("len %d: %d lcs of length %d for %s %s (all alphabets)\n", n, bestcount, bestlen,besta, bestb); }else if(strcmp(argv[1], "search2") == 0){ if(argc != 3) usage(); n = atoi(argv[2]); search(2, 0, n, buf, 0); printf("len %d: %d lcs of length %d for %s %s (just alphabet=ab)\n", n, bestcount, bestlen, besta, bestb); }else usage(); return 0; }