This site is supported by donations to The OEIS Foundation.

User:Antti Karttunen/awkarttu.buffaloes.py

From OeisWiki
Jump to: navigation, search

This is a Python source code that I wrote in February 2008 as my answer to the assignment 3 of Language Technology course Clt233 at the Deparment of General Linguistics at University of Helsinki. See also Wikipedia-article "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo" and page User:Antti Karttunen/Etsivät etsivät etsivät.

The Natural Language Toolkit can be downloaded from http://nltk.org/.

#!/usr/bin/python
# -*- coding: utf8 -*- 
# Scandinavian letters: "ä" and "ö".

# Antti Karttunen's answer to the assignment 3 of course Clt233.
# See the end of module for the proof that the number of
# parsing alternatives of the sentence buffalo^n with buffalo2grammar
# indeed follow the sequence A007477[n-1] for n >= 3 onward.

import nltk
import re
import sys



def parse_sentences(grammar,word,up_to_n_times,verbose):
  '''Parses sentences formed from 1 to 'up_to_n_times' 'word's with 'grammar'.'''

# parser = nltk.RecursiveDescentParser(grammar)
# parser = nltk.ChartParser(grammar, nltk.parse.BU_STRATEGY) # , trace=2)
  parser = nltk.ChartParser(grammar, nltk.parse.TD_STRATEGY) # , trace=2)


  sent = ""
  seq = []

  for n in range(up_to_n_times):

    if(len(sent) > 0): sent += " "
    sent += word

    try:
      parse_list = parser.nbest_parse(sent.split())
    except ValueError: # Grammar does not cover some of the input words?
      parse_list = []

    n_alts = len(parse_list)

    seq.append(n_alts)

    if(0 == n_alts):
      sys.stdout.write("NOT RECOGNIZED BY THIS GRAMMAR: " + sent + "\n")
    elif(verbose):
      sys.stdout.write("SENTENCE: " + sent + " WAS PARSED IN " +
                        str(n_alts) + " DIFFERENT WAYS:\n")
      for p in parse_list:
        print p
      sys.stdout.write("\n")

    sys.stdout.flush()

  sys.stdout.write("Sequence grows as: " + str(seq) + "\n")

# The end of function

########################################################################
# Part 1: The "Buffalo" sentence                                       #
########################################################################

buffalo1grammar = nltk.parse_cfg("""
  S -> NP VP
  PN -> "Buffalo"
  N -> "buffalo"
  V -> "buffalo"
  NP -> PN N | NP RC
  RC -> NP V
  VP -> V NP
  """)


parse_sentences(buffalo1grammar,"Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo",1,True)


########################################################################
# Part 2 & 3: More buffaloes & More and more buffaloes                 #
########################################################################

buffalo2grammar = nltk.parse_cfg("""
  S -> NP VP
  PN -> "buffalo"
  N -> "buffalo"
  V -> "buffalo"
  NP -> N | PN N | NP RC
  RC -> NP V
  VP -> V NP
  """)

# parse_sentences(buffalo2grammar,"buffalo",5,True)

parse_sentences(buffalo2grammar,"buffalo",12,False)

# This gives:
# 0,0,1,2,3,6,11,22,44,90,187,392,832,1778,3831,8304,18104,39666,...
#
# Compare with OEIS sequence A007477 with description
# "Shifts 2 places left when convolved with itself.":
# 1, 1, 1, 2, 3, 6, 11, 22, 44, 90, 187, 392, 832, 1778, 3831, 8304, 18104,
# 39666, 87296, 192896, 427778, 951808, 2124135, 4753476, 10664458, 23981698,
# 54045448, 122041844, 276101386, 625725936, 1420386363

# http://www.research.att.com/~njas/sequences/A007477
# and in the Encyclopedia of Combinatorial Structures as:
# http://algo.inria.fr/bin/encyclopedia?Search=ECSnb&argsearch=441
# with the structure:
# [S, {S = Union(Z,Prod(Z,Union(Z,Prod(S,S))))}, unlabelled]

# See also N. S. S. Gu, N. Y. Li and T. Mansour,
# 2-Binary trees: bijections and related issues,
# Discr. Math., 308 (2008), 1209-1221.
# Paper available at:
# http://www.combinatorics.net.cn/research/Papers_GetFile.aspx?paperID=269

# Have to check the equivalence! If true, then there are
# A007477[24-1] = 4753476 ways to parse the sentence of buffalo^24
# with buffalo2grammar. (My laptop is TOO SLOW TO COUNT THAT EMPIRICALLY!)

# Note that the other sequences are not yet in OEIS. (I will add them
# in my due time...)

########################################################################
# Bonus, allowing single "buffalo" stand as a verb (imperative form)   #
########################################################################

buffalo3grammar = nltk.parse_cfg("""
  S -> VP | NP VP
  PN -> "buffalo"
  N -> "buffalo"
  V -> "buffalo"
  NP -> N | PN N | NP RC
  RC -> NP V
  VP -> V NP | V
  """)

parse_sentences(buffalo3grammar,"buffalo",12,False)


########################################################################
# Bonus, "etsivät" grammar:                                           #
########################################################################

etsivat1grammar_org = nltk.parse_cfg("""
  S -> NP V NP | NP V
  PresParticiple -> "etsivät"
  N -> "etsivät"
  V -> "etsivät"
  NP -> N | PresParticiple N | NP PresParticiple NP
  """)


etsivat1grammar_long = nltk.parse_cfg("""
  S -> NP V NP | NP V
  PresentParticiple -> "etsivät"
  N -> "etsivät"
  V -> "etsivät"
  Participle_Phrase -> PresentParticiple N | NP PresentParticiple NP
  NP -> N | Participle_Phrase
  """)

etsivat1grammar = nltk.parse_cfg("""
  S -> NP V NP | NP V
  P -> "etsivät"
  N -> "etsivät"
  V -> "etsivät"
  NP -> N | P N | NP P NP
  """)

#  PP -> P N | NP P NP
#  NP -> N | PP

# Note that in the last case: NP PresParticiple NP
# the first NP is always patient of the action,
# and the second is the agent
# (i.e. "A:t etsivät B:t" means "As seeking Bs", or "B's that seek A's")

# parse_sentences(etsivat1grammar,"etsivät",7,True)

parse_sentences(etsivat1grammar,"etsivät",6,True)
# parse_sentences(etsivat1grammar,"etsivät",12,False)

# Note, with that grammar:
# SENTENCE: "etsivät etsivät etsivät etsivät etsivät etsivät etsivät"
# IS PARSED IN 17 DIFFERENT WAYS:

# (S
#   (NP (NP (N etsivät)) (PresParticiple etsivät) (NP (N etsivät)))
#   (V etsivät)
#   (NP (NP (N etsivät)) (PresParticiple etsivät) (NP (N etsivät))))
# This makes sense:
# "The detectives who seek detectives seek detectives who seek detectives."

# (S
#   (NP
#     (NP
#       (NP (N etsivät))
#       (PresParticiple etsivät)
# 
#       (NP (N etsivät)))
#     (PresParticiple etsivät)
#     (NP (N etsivät)))
#   (V etsivät)
#   (NP (N etsivät)))
# "The detectives who seek detectives investigating detectives seek detectives".

# (S
#   (NP
#     (NP (PresParticiple etsivät) (N etsivät))
#     (PresParticiple etsivät)
#     (NP (PresParticiple etsivät) (N etsivät)))
# 
#   (V etsivät)
#   (NP (N etsivät)))
# This makes sense as well:
# "The investigating detectives who seek investigating detectives seek detectives."
# Or "Investigating detectives seeking investigating detectives seek detectives."
#


binarytree1grammar = nltk.parse_cfg("""
  B -> Z | B B
  Z -> "L"
  """)

# parse_sentences(binarytree1grammar,"L",5,True)
parse_sentences(binarytree1grammar,"L",10,False)

# and we get Catalans (A000108) as we should!

# Now we convert this combstruct definition of A007477,
# i.e. ECS #441:

# [S, {S = Union(Z,Prod(Z,Union(Z,Prod(S,S))))}, unlabelled]

# by this intermediate "expanded version"
# [S, {S = Union(Z,Prod(Z,S2)), S2 = Union(Z,Prod(S,S))}, unlabelled]

# to a NLTK-grammar:

ECS441grammar = nltk.parse_cfg("""
  S -> Z | Z S2
  S2 -> Z | S S
  Z -> "L"
  """)

parse_sentences(ECS441grammar,"L",4,True)
parse_sentences(ECS441grammar,"L",10,False)

# And indeed we get:
# Sequence grows as: [1, 1, 1, 2, 3, 6, 11, 22, 44, 90]
# i.e. the initial terms of http://www.research.att.com/~njas/sequences/A007477
# Or ACTUALLY, the terms of
# http://algo.inria.fr/bin/encyclopedia?Search=ECSnb&argsearch=441
# from the offset n=1 onward.
# [1, 1, 1, 2, 3, 6, 11, 22, 44, 90, 187, 392, 832, 1778, 3831, 8304, 18104, 39666, 87296, 192896]

# So, this grammar gives us essentially three kinds of productions:
#
#
#  S                S                S
#  |               / \              / \
#  Z              Z   S2           Z  S2
#                     |              / \
#                     Z             S   S
#
# Compare with the productions of buffalo2grammar:
#   S -> NP VP
#   PN -> "buffalo"
#   N -> "buffalo"
#   V -> "buffalo"
#   NP -> N | PN N | NP RC
#   RC -> NP V
#   VP -> V NP
#
# (first for its NP):
#
#     NP           NP               NP
#     |           / \              /  \
#     N         PN   N           NP    RC
#                                     / \
#                                   NP   V
#
# All leaves (N, PN & V) are buffaloes.
# Two first productions are isomorphic with two first productions
# of ECS441grammar, and the third one has likewise one leaf
# and two further branches of similar structures.

# Now the toplevel S production of buffalo2grammar, S -> NP VP
# is expanded as:
#
#
#     S
#    / \
#  NP  VP
#     / \
#    V   NP
#
# again with one leaf (V) and two branches to NP's.
# (reflect the right-hand-side subtree, and you get binary tree
# isomorphic with the last production for NP, above).
# So, apart for structures of size 1 and 2, buffalo2grammar
# should generate the same sequence as ECS441grammar:
# [0, 0, 1, 2, 3, 6, 11, 22, 44, 90, 187, 392]
# as it indeed does.
#                                             Q.E.D.
# (Antti Karttunen, Porvoo, February 28 2008).
#
#
# Thus we have zero alternatives for "buffalo" and "buffalo buffalo",
# and then the values A007477[2] .. A007477[23]
# give the values for alternative parsings of sentences from
# buffalo^3 to buffalo^24:
#
# 1
# 2
# 3
# 6
# 11
# 22
# 44
# 90
# 187
# 392
# 832
# 1778
# 3831
# 8304
# 18104
# 39666
# 87296
# 192896
# 427778
# 951808
# 2124135
# 4753476
#

# Converted from Maple procedure definition:
#  A007477 := proc(n) option remember; local k; if n <= 1 then 1 else add(A007477(k)*A007477(n-k-2), k=0..n-2); fi; end;
#


from array import *

def gen_n_items(n,item):
  for i in xrange(n): yield(item)

def memoed(fun):
    maxmemosize = 131 # 072
    notfilled = 0L
    memo = array('L')
    def wrapper(n):
        memsiznow = len(memo)
        if None == n: return(memo) # For debugging
        elif n >= memsiznow:
              newsize = min(maxmemosize,max(n+1,2*memsiznow))
              memo.extend(gen_n_items(newsize-memsiznow,notfilled))

        res = memo[n]
        if(notfilled == res):
              res = fun(n)
              memo[n] = res
              return(res)
        else: return(res)
    return wrapper


# With memoed we get results MUCH faster:

@memoed
def A007477(n):
  '''Compute A007477(n)'''
  if(n < 2): return(1)
  else:
    s = 0
    for k in range(n-1):
      s += (A007477(k)*A007477(n-k-2))
    return(s)


sys.stdout.write("A007477[0..31] follows: " + str([A007477(n) for n in range(31)]) + "\n")