This site is supported by donations to The OEIS Foundation.
User:Antti Karttunen/awkarttu.buffaloes.py
From OeisWiki
This is a Python source code that I wrote in February 2008 as my answer to the assignment 3 of Language Technology course Clt233 at the Deparment of General Linguistics at University of Helsinki. See also Wikipedia-article "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo" and page User:Antti Karttunen/Etsivät etsivät etsivät.
The Natural Language Toolkit can be downloaded from http://nltk.org/.
#!/usr/bin/python # -*- coding: utf8 -*- # Scandinavian letters: "ä" and "ö". # Antti Karttunen's answer to the assignment 3 of course Clt233. # See the end of module for the proof that the number of # parsing alternatives of the sentence buffalo^n with buffalo2grammar # indeed follow the sequence A007477[n-1] for n >= 3 onward. import nltk import re import sys def parse_sentences(grammar,word,up_to_n_times,verbose): '''Parses sentences formed from 1 to 'up_to_n_times' 'word's with 'grammar'.''' # parser = nltk.RecursiveDescentParser(grammar) # parser = nltk.ChartParser(grammar, nltk.parse.BU_STRATEGY) # , trace=2) parser = nltk.ChartParser(grammar, nltk.parse.TD_STRATEGY) # , trace=2) sent = "" seq = [] for n in range(up_to_n_times): if(len(sent) > 0): sent += " " sent += word try: parse_list = parser.nbest_parse(sent.split()) except ValueError: # Grammar does not cover some of the input words? parse_list = [] n_alts = len(parse_list) seq.append(n_alts) if(0 == n_alts): sys.stdout.write("NOT RECOGNIZED BY THIS GRAMMAR: " + sent + "\n") elif(verbose): sys.stdout.write("SENTENCE: " + sent + " WAS PARSED IN " + str(n_alts) + " DIFFERENT WAYS:\n") for p in parse_list: print p sys.stdout.write("\n") sys.stdout.flush() sys.stdout.write("Sequence grows as: " + str(seq) + "\n") # The end of function ######################################################################## # Part 1: The "Buffalo" sentence # ######################################################################## buffalo1grammar = nltk.parse_cfg(""" S -> NP VP PN -> "Buffalo" N -> "buffalo" V -> "buffalo" NP -> PN N | NP RC RC -> NP V VP -> V NP """) parse_sentences(buffalo1grammar,"Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo",1,True) ######################################################################## # Part 2 & 3: More buffaloes & More and more buffaloes # ######################################################################## buffalo2grammar = nltk.parse_cfg(""" S -> NP VP PN -> "buffalo" N -> "buffalo" V -> "buffalo" NP -> N | PN N | NP RC RC -> NP V VP -> V NP """) # parse_sentences(buffalo2grammar,"buffalo",5,True) parse_sentences(buffalo2grammar,"buffalo",12,False) # This gives: # 0,0,1,2,3,6,11,22,44,90,187,392,832,1778,3831,8304,18104,39666,... # # Compare with OEIS sequence A007477 with description # "Shifts 2 places left when convolved with itself.": # 1, 1, 1, 2, 3, 6, 11, 22, 44, 90, 187, 392, 832, 1778, 3831, 8304, 18104, # 39666, 87296, 192896, 427778, 951808, 2124135, 4753476, 10664458, 23981698, # 54045448, 122041844, 276101386, 625725936, 1420386363 # http://www.research.att.com/~njas/sequences/A007477 # and in the Encyclopedia of Combinatorial Structures as: # http://algo.inria.fr/bin/encyclopedia?Search=ECSnb&argsearch=441 # with the structure: # [S, {S = Union(Z,Prod(Z,Union(Z,Prod(S,S))))}, unlabelled] # See also N. S. S. Gu, N. Y. Li and T. Mansour, # 2-Binary trees: bijections and related issues, # Discr. Math., 308 (2008), 1209-1221. # Paper available at: # http://www.combinatorics.net.cn/research/Papers_GetFile.aspx?paperID=269 # Have to check the equivalence! If true, then there are # A007477[24-1] = 4753476 ways to parse the sentence of buffalo^24 # with buffalo2grammar. (My laptop is TOO SLOW TO COUNT THAT EMPIRICALLY!) # Note that the other sequences are not yet in OEIS. (I will add them # in my due time...) ######################################################################## # Bonus, allowing single "buffalo" stand as a verb (imperative form) # ######################################################################## buffalo3grammar = nltk.parse_cfg(""" S -> VP | NP VP PN -> "buffalo" N -> "buffalo" V -> "buffalo" NP -> N | PN N | NP RC RC -> NP V VP -> V NP | V """) parse_sentences(buffalo3grammar,"buffalo",12,False) ######################################################################## # Bonus, "etsivät" grammar: # ######################################################################## etsivat1grammar_org = nltk.parse_cfg(""" S -> NP V NP | NP V PresParticiple -> "etsivät" N -> "etsivät" V -> "etsivät" NP -> N | PresParticiple N | NP PresParticiple NP """) etsivat1grammar_long = nltk.parse_cfg(""" S -> NP V NP | NP V PresentParticiple -> "etsivät" N -> "etsivät" V -> "etsivät" Participle_Phrase -> PresentParticiple N | NP PresentParticiple NP NP -> N | Participle_Phrase """) etsivat1grammar = nltk.parse_cfg(""" S -> NP V NP | NP V P -> "etsivät" N -> "etsivät" V -> "etsivät" NP -> N | P N | NP P NP """) # PP -> P N | NP P NP # NP -> N | PP # Note that in the last case: NP PresParticiple NP # the first NP is always patient of the action, # and the second is the agent # (i.e. "A:t etsivät B:t" means "As seeking Bs", or "B's that seek A's") # parse_sentences(etsivat1grammar,"etsivät",7,True) parse_sentences(etsivat1grammar,"etsivät",6,True) # parse_sentences(etsivat1grammar,"etsivät",12,False) # Note, with that grammar: # SENTENCE: "etsivät etsivät etsivät etsivät etsivät etsivät etsivät" # IS PARSED IN 17 DIFFERENT WAYS: # (S # (NP (NP (N etsivät)) (PresParticiple etsivät) (NP (N etsivät))) # (V etsivät) # (NP (NP (N etsivät)) (PresParticiple etsivät) (NP (N etsivät)))) # This makes sense: # "The detectives who seek detectives seek detectives who seek detectives." # (S # (NP # (NP # (NP (N etsivät)) # (PresParticiple etsivät) # # (NP (N etsivät))) # (PresParticiple etsivät) # (NP (N etsivät))) # (V etsivät) # (NP (N etsivät))) # "The detectives who seek detectives investigating detectives seek detectives". # (S # (NP # (NP (PresParticiple etsivät) (N etsivät)) # (PresParticiple etsivät) # (NP (PresParticiple etsivät) (N etsivät))) # # (V etsivät) # (NP (N etsivät))) # This makes sense as well: # "The investigating detectives who seek investigating detectives seek detectives." # Or "Investigating detectives seeking investigating detectives seek detectives." # binarytree1grammar = nltk.parse_cfg(""" B -> Z | B B Z -> "L" """) # parse_sentences(binarytree1grammar,"L",5,True) parse_sentences(binarytree1grammar,"L",10,False) # and we get Catalans (A000108) as we should! # Now we convert this combstruct definition of A007477, # i.e. ECS #441: # [S, {S = Union(Z,Prod(Z,Union(Z,Prod(S,S))))}, unlabelled] # by this intermediate "expanded version" # [S, {S = Union(Z,Prod(Z,S2)), S2 = Union(Z,Prod(S,S))}, unlabelled] # to a NLTK-grammar: ECS441grammar = nltk.parse_cfg(""" S -> Z | Z S2 S2 -> Z | S S Z -> "L" """) parse_sentences(ECS441grammar,"L",4,True) parse_sentences(ECS441grammar,"L",10,False) # And indeed we get: # Sequence grows as: [1, 1, 1, 2, 3, 6, 11, 22, 44, 90] # i.e. the initial terms of http://www.research.att.com/~njas/sequences/A007477 # Or ACTUALLY, the terms of # http://algo.inria.fr/bin/encyclopedia?Search=ECSnb&argsearch=441 # from the offset n=1 onward. # [1, 1, 1, 2, 3, 6, 11, 22, 44, 90, 187, 392, 832, 1778, 3831, 8304, 18104, 39666, 87296, 192896] # So, this grammar gives us essentially three kinds of productions: # # # S S S # | / \ / \ # Z Z S2 Z S2 # | / \ # Z S S # # Compare with the productions of buffalo2grammar: # S -> NP VP # PN -> "buffalo" # N -> "buffalo" # V -> "buffalo" # NP -> N | PN N | NP RC # RC -> NP V # VP -> V NP # # (first for its NP): # # NP NP NP # | / \ / \ # N PN N NP RC # / \ # NP V # # All leaves (N, PN & V) are buffaloes. # Two first productions are isomorphic with two first productions # of ECS441grammar, and the third one has likewise one leaf # and two further branches of similar structures. # Now the toplevel S production of buffalo2grammar, S -> NP VP # is expanded as: # # # S # / \ # NP VP # / \ # V NP # # again with one leaf (V) and two branches to NP's. # (reflect the right-hand-side subtree, and you get binary tree # isomorphic with the last production for NP, above). # So, apart for structures of size 1 and 2, buffalo2grammar # should generate the same sequence as ECS441grammar: # [0, 0, 1, 2, 3, 6, 11, 22, 44, 90, 187, 392] # as it indeed does. # Q.E.D. # (Antti Karttunen, Porvoo, February 28 2008). # # # Thus we have zero alternatives for "buffalo" and "buffalo buffalo", # and then the values A007477[2] .. A007477[23] # give the values for alternative parsings of sentences from # buffalo^3 to buffalo^24: # # 1 # 2 # 3 # 6 # 11 # 22 # 44 # 90 # 187 # 392 # 832 # 1778 # 3831 # 8304 # 18104 # 39666 # 87296 # 192896 # 427778 # 951808 # 2124135 # 4753476 # # Converted from Maple procedure definition: # A007477 := proc(n) option remember; local k; if n <= 1 then 1 else add(A007477(k)*A007477(n-k-2), k=0..n-2); fi; end; # from array import * def gen_n_items(n,item): for i in xrange(n): yield(item) def memoed(fun): maxmemosize = 131 # 072 notfilled = 0L memo = array('L') def wrapper(n): memsiznow = len(memo) if None == n: return(memo) # For debugging elif n >= memsiznow: newsize = min(maxmemosize,max(n+1,2*memsiznow)) memo.extend(gen_n_items(newsize-memsiznow,notfilled)) res = memo[n] if(notfilled == res): res = fun(n) memo[n] = res return(res) else: return(res) return wrapper # With memoed we get results MUCH faster: @memoed def A007477(n): '''Compute A007477(n)''' if(n < 2): return(1) else: s = 0 for k in range(n-1): s += (A007477(k)*A007477(n-k-2)) return(s) sys.stdout.write("A007477[0..31] follows: " + str([A007477(n) for n in range(31)]) + "\n")