Source code for cfg

# -*- coding: utf-8 -*-
"""**Context Free Grammars Manipulation.**

Basic context-free grammars manipulation for building uniform random generetors

.. *Authors:* Rogério Reis & Nelma Moreira

.. *This is part of FAdo project* http://fado.dcc.fc.up.pt

.. *Copyright:* 1999-2014 Rogério Reis & Nelma Moreira {rvr,nam}@dcc.fc.up.pt


.. This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License along
   with this program; if not, write to the Free Software Foundation, Inc.,
   675 Mass Ave, Cambridge, MA 02139, USA."""

#__package__ = "FAdo"

import re
import string
from types import StringType
from random import randint
import common

[docs]class CFGrammar(object):
    """ Class for context-free grammars

     :var Rules: grammar rules
     :var Terminals: terminals symbols
     :var Nonterminals: nonterminals symbols
     :var str Start: start symbol
     :var ntr: dictionary of rules for each nonterminal"""

    def __init__(self, gram):
        """Initialization

            :param gram: is a list for productions; each production is a tuple (LeftHandside, RightHandside) with
                LeftHandside nonterminal, RightHandside list of symbols, First production is for start symbol

        """
        self.Rules = gram
        self.Nonterminals = {r[0] for r in self.Rules}
        self.Terminals = set()
        for r in self.Rules:
            if type(r[1]) is StringType:
                if r[1] not in self.Nonterminals:
                    self.Terminals.add(r[1])
            else:
                for s in r[1]:
                    if s not in self.Nonterminals:
                        self.Terminals.add(s)
        self.Start = self.Rules[0][0]
        self.Nullable = {}
        self.tr = {}
        self.ntr = {}
        for i in xrange(len(self.Rules)):
            if self.Rules[i][0] not in self.ntr:
                self.ntr[self.Rules[i][0]] = {i}
            else:
                self.ntr[self.Rules[i][0]].add(i)

    def __str__(self):
        """Grammar rules

        :return: a string representing the grammar rules"""
        s = ""
        for n in xrange(len(self.Rules)):
            lhs = self.Rules[n][0]
            rhs = self.Rules[n][1]
            if type(rhs) is not StringType and len(rhs) > 1:
                rhs = string.join(rhs)
            s += "{0:d} | {1:s} -> {2:s} \n".format(n, lhs, rhs)
        return "Grammar Rules:\n\n%s" % s

[docs]    def maketerminals(self):
        """Extracts C{terminals} from the rules. Nonterminals must already exist"""
        self.Terminals = set()
        for r in self.Rules:
            if type(r[1]) is StringType:
                if r[1] not in self.Nonterminals:
                    self.Terminals.add(r[1])
            else:
                for s in r[1]:
                    if s not in self.Nonterminals:
                        self.Terminals.add(s)

[docs]    def makenonterminals(self):
        """Extracts C{nonterminals}  from grammar rules."""
        for r in self.Rules:
            self.Nonterminals.add(r[0])

    def terminalrules(self):
        self.tr = {}
        for a in self.Terminals:
            for i in xrange(len(self.Rules)):
                if self.Rules[i][1] == a:
                    if a not in self.tr:
                        self.tr[a] = {i}
                    else:
                        self.tr[a].add(i)

    def nonterminalrules(self):
        self.ntr = {}
        for i in xrange(len(self.Rules)):
            if self.Rules[i][0] not in self.ntr:
                self.ntr[self.Rules[i][0]] = {i}
            else:
                self.ntr[self.Rules[i][0]].add(i)

[docs]    def NULLABLE(self):
        """Determines which nonterminals X ->* [] """
        self.Nullable = {}
        for s in self.Terminals:
            self.Nullable[s] = 0
        for s in self.Nonterminals:
            self.Nullable[s] = 0
            if s in self.ntr:
                for i in self.ntr[s]:
                    if not self.Rules[i][1]:
                        self.Nullable[s] = 1
                        break
        k = 1
        while k == 1:
            k = 0
            for r in self.Rules:
                e = 0
                for i in r[1]:
                    if not self.Nullable[i]:
                        e = 1
                        break
                if e == 0 and not self.Nullable[r[0]]:
                    self.Nullable[r[0]] = 1
                    k = 1


[docs]class CNF(CFGrammar):
    """No useless nonterminals or epsilon rules are ALLOWED... Given a CFG grammar description generates one in CNF
      Then its possible to random generate words of a given size. Before some pre-calculations are nedded."""

    def __init__(self, gram, mark="A@"):
       # super(CNF, self).__init__(gram)
        CFGrammar.__init__(self, gram)
        self.mark = mark
        self.newnt = 0
        self.nttr = {}
        self.unitary = self.get_unitary()
        self.Chomsky()

    def get_unitary(self):
        return set([r for r in self.Rules if
                    (type(r[1]) is StringType and
                     r[1] in self.Nonterminals) or
                    (len(r[1]) == 1 and r[1][0] in self.Nonterminals)])

[docs]    def elim_unitary(self):
        """Elimination of unitary rules """
        f = 1
        while f:
            f = 0
            self.unitary = self.get_unitary()

            for u in self.unitary:
                if type(u[1]) is StringType:
                    ui = u[1]
                else:
                    ui = u[1][0]
                if ui in self.ntr:
                    for i in self.ntr[ui]:
                        if (u[0], self.Rules[i][1]) not in self.Rules:
                            f = 1
                            self.Rules.append((u[0], self.Rules[i][1]))
                            self.ntr[u[0]].add(len(self.Rules) - 1)

        for u in self.unitary:
            self.Rules.remove(u)

    def get_ntr_tr(self, a):
        nta = self.mark + a
        self.Nonterminals.add(nta)
        self.Rules.append((nta, a))
        return nta

    def iter_rule(self, lhs, rhs, i):
        if type(rhs) is not StringType and len(rhs) == 2:
            self.Rules[i] = (lhs, rhs)
            return
        nta = self.mark + "_" + str(self.newnt)
        self.Nonterminals.add(nta)
        self.newnt += 1
        self.Rules.append((lhs, (rhs[0], nta)))
        self.iter_rule(nta, rhs[1:], i)

[docs]    def Chomsky(self):
        """ Transform to CNF """
        self.elim_unitary()
        self.nttr = {}
        # terminal a is replaced by A@_a in all rules > 2
        for a in self.Terminals:
            for i in xrange(len(self.Rules)):
                if type(self.Rules[i][1]) is not StringType and len(self.Rules[i][1]) >= 2 and a in self.Rules[i][1]:
                    if a not in self.nttr:
                        self.nttr[a] = self.get_ntr_tr(a)
                    rr = list(self.Rules[i][1])
                    for k in xrange(len(rr)):
                        if rr[k] == a:
                            rr[k] = self.nttr[a]
                    self.Rules[i] = (self.Rules[i][0], tuple(rr))
        n = len(self.Rules)
        for i in xrange(n):
            if type(self.Rules[i][1]) is not StringType and len(self.Rules[i][1]) > 2:
                self.iter_rule(self.Rules[i][0], self.Rules[i][1], i)


[docs]class cfgGenerator(object):
    """CFG uniform genetaror """
    def __init__(self, cfgr, size):
        """Object initialization
            :param cfgr: grammar for the random objects
            :type cfgr: CNF
            :param size: size of objects
            :type size: integer"""
        self.grammar = cfgr
        self.size = size
 #       self.density = {}
 #       self.density_r = {}
        self._eval_densities(size)

[docs]    def generate(self):
        """Generates a new random object generated from the start symbol

            :returns: object
            :rtype: string"""
        return self._gen(self.grammar.Start, self.size)

    def _gen(self, nt, n):
        """Generates a new random object generated from the nonterminal

        :param str nt: nonterminal
        :param int n: object size
        :returns: object
        :rtype: str"""
        g = self.grammar
        if n in self.density[nt] and self.density[nt][n] > 0:
            u = randint(1, self.density[nt][n])
            r = 1
            if n == 1:
                for i in g.ntr[nt]:
                    if g.Rules[i][1] in g.Terminals:
                        r += 1
                        if r > u:
                            ic = i
                            break
                try:
                    return g.Rules[ic][1]
                except KeyError:
                    raise KeyError
            for i in g.ntr[nt]:
                if len(g.Rules[i][1]) == 2:
                    if n in self.density_r[i]:
                        r += self.density_r[i][n]
                        if r > u:
                            ic = i
                            break
            uk = randint(1, self.density_r[ic][n])
            rk = 1
            for k in xrange(1, n):
                if (k in self.density[g.Rules[ic][1][0]] and self.density[g.Rules[ic][1][0]][k] > 0 and
                            n - k in self.density[g.Rules[ic][1][1]] and self.density[g.Rules[ic][1][1]][
                        n - k] > 0):
                    rk += self.density[g.Rules[ic][1][0]][k] * self.density[g.Rules[ic][1][1]][n - k]
                    if rk > uk:
                        kk = k
                        break
            return self._gen(g.Rules[ic][1][0], kk) + self._gen(g.Rules[ic][1][1], n - kk)

    def _eval_densities(self, n):
        """Evaluates densities

        :param int n: object size"""
        g = self.grammar
        self.density = {}
        self.density_r = {}
        for nt in g.Nonterminals:
            self.density[nt] = {}
            self.density[nt][1] = 0
        g.terminalrules()
        g.nonterminalrules()
        for t in g.tr:
            for i in g.tr[t]:
                self.density[g.Rules[i][0]][1] += 1
        for l in xrange(2, n + 1):
            for nt in g.ntr:
                r = 0
                for i in g.ntr[nt]:
                    if len(g.Rules[i][1]) == 2:
                        if i not in self.density_r:
                            self.density_r[i] = {}
                        self.density_r[i][l] = sum(
                            [self.density[g.Rules[i][1][0]][k] * self.density[g.Rules[i][1][1]][l - k] for k in
                             xrange(1, l) if
                             k in self.density[g.Rules[i][1][0]] and l - k in self.density[g.Rules[i][1][1]]])
                        r += self.density_r[i][l]
                if r:
                    self.density[nt][l] = r


[docs]class reStringRGenerator(cfgGenerator):
    """Uniform random Generator for reStrings"""

    def __init__(self, Sigma=None, size=10, cfgr=None, epsilon=None, empty=None, ident="Ti"):
        """ Uniform random generator for regular expressions. Used without arguments generates an uncollapsible re
                over {a,b} with size 10. For generate an arbitary re over an alphabet of 10 symbols of size 100:
                reStringRGenerator (smallAlphabet(10),100,reGrammar["g_regular_base"])

        :param list|set Sigma: re alphabet (that will be the set of grammar terminals)
        :param int size: word size
        :param cfgr: base grammar
        :param epsilon: if not None is added to a grammar terminals
        :param empty: if not None is added to a grammar terminals

        .. note::
           the grammar can have already this symbols"""
        if cfgr is None:
            self.base = gRules(reGrammar["g_regular_uncollaps"])
        else:
            self.base = gRules(cfgr)
        if Sigma is None:
            self.Sigma = ["a", "b"]
        else:
            self.Sigma = Sigma
        for i in self.Sigma:
            self.base.append((ident, i))
        if epsilon is not None:
            self.base.append((ident, common.Epsilon))
        if empty is not None:
            self.base.append((ident, common.EmptySet))
        super(reStringRGenerator, self).__init__(CNF(self.base),size)


#noinspection PyUnusedLocal
def CYKParserTable(gramm, word):
    """Evaluates CYK parser table

      :param CNF gramm: grammar
      :param str word: word to be parsed
      :returns: the CYK table
      :rtype: list of lists of symbols"""
    pass


[docs]def gRules(rules_list, rulesym="->", rhssep=None, rulesep='|'):
    """Transforms a list of rules into  a grammar description.

      :param rules_list: is a list of rule where rule is a string  of the form: Word rulesym Word1 ... Word2 or  Word
          rulesym []
      :param rulesym: LHS and RHS rule separator
      :param rhssep: RHS values separator (None for white chars)
      :return: a grammar description """
    gr = []
    sep = re.compile(rulesym)
    #rsep = re.compile(rulesep)
    for r in rules_list:
        if type(r) is StringType:
            rule = r
        else:
            rule = r[0]
        m = sep.search(rule)
        if not m:
            continue
        else:
            if m.start() == 0:
                raise common.CFGgrammarError(rule)
            else:
                lhs = rule[0:m.start()].strip()
            if m.end() == len(rule):
                raise common.CFGgrammarError(rule)
            else:
                rest = string.strip(rule[m.end():])
                if rest == "[]":
                    rhs = []
                else:
                    multi = string.split(rest, rulesep)
                    rhs = []
                    for i in multi:
                        l = string.split(i, rhssep)
                        if len(l) > 1:
                            l = tuple(string.split(i, rhssep))
                        else:
                            l = l[0]
                        gr.append((lhs, l))
    return gr

[docs]def smallAlphabet(k, sigma_base="a"):
    """Easy way to have small alphabets

      :param k: alphabet size (must be less than 52)
      :param sigma_base: initial symbol
      :returns: alphabet
      :rtype: list"""
    Sigma = []
    if k >= 52:
        raise common.CFGterminalError(k)
    lim = min(26, k)
    for i in xrange(lim):
        Sigma.append(chr(ord(sigma_base) + i))
    if k >= 26:
        sigma_base = 'A'
        for i in xrange(k - lim):
            Sigma.append(chr(ord(sigma_base) + i))
    return Sigma

reGrammar = {'g_regular_base': ["Tr -> Tr + Tc | Tc", "Tc -> Tc Ts |  Ts",
                                "Ts -> Ts * | Ti | ( Tr ) "],
             'g_finite_base': ["Tr -> Tr + Tc | Tc", "Tc -> Tc Ts |  Ts",
                                "Ts ->  Ti | ( Tr ) "],
             'g_regular_wredund': ["Tr -> Trs | Tf ", "Trs -> Trs + Tf | Tf + Tf",
                                   "Tf -> Tc | Te | Ti",
                                   "Tc -> Tc Ts |  Ts Ts",
                                   "Ts -> Te | Ti | ( Trs ) ",
                                   "Te -> ( Trs ) * | ( Tc ) * | Ti *"],
             'g_regular_uncollaps': ["Ts ->  Trs | Tcc  | Tee | Ti | {0:s} | {1:s}".format(common.Epsilon, common.EmptySet),
                                     "Tcc -> Tcc Tr | Tr Tr",
                                     "Tr -> ( Trs ) | Tee | Ti ",
                                     "Tee -> ( Trs ) * | ( Tcc ) * | Ti *",
                                     "Trs -> {0:s} + Tx | Ty + Tz".format(common.Epsilon),
                                     "Tx -> Tt | Tt + Tx",
                                     "Tt -> Tcc | Ti ",
                                     "Ty -> Tz | Ty + Tz",
                                     "Tz -> Tcc | Tee | Ti "],
             'g_pt': [" Tz -> Tz + Tf | Tf",
                      "Tf -> Tf {0:s}  Te | Te".format(common.Conj),
                      "Te -> {0:s} ( Ts ) | Ts ".format(common.Not),
                       "Ts -> Ts Tu | {0:s} Tu".format(common.SigmaS),
                      "Tu ->  Ti {0:s}".format(common.SigmaS)],
             'g_sha': ["Ts  -> Ted | Tec | Tes | Ti | {0:s}".format(common.Epsilon),
                       "Tec -> . Tec Tr | . Tr Tr ",
                       "Tr -> Ted | Tes | Ti",
                       "Tes -> * Ted | * Tec | * Ti",
                       "Ted -> + {0:s} Tx | + Ty Tz".format(common.Epsilon),
                       "Tx -> Tt | + Tt Tx",
                       "Tt -> Tec | Ti",
                       "Ty -> Tz | + Ty Tz",
                       "Tz -> Tec | Tes | Ti"],
             'g_rpn': ["Tx -> {0:s} | Ti | +  Tx  Tx  | . Tx Tx  | * Tx".format(common.Epsilon)],
             'g_rpn_finite': ["Tx -> {0:s} | Ti | +  Tx  Tx  | . Tx Tx".format(common.Epsilon)],
             'g_rpn_snf':["Tx ->  Tp | Tnp",
                    "Tp -> {0:s} | . Tp Tp | + Tp Tnp | + Tnp Tp | + Tp Tp |  * Tnp ".format(common.Epsilon),
                    "Tnp -> Ti | + Tnp Tnp | . Tp Tnp | . Tnp Tp | . Tnp Tnp"],
             'g_rpn_snf_uf': ["Tx ->  Tp | Tnp",
                           "Tp ->  . Tp Tp |   * Tnp ",
                           "Tnp -> Ti |  . Tp Tnp | . Tnp Tp | . Tnp Tnp"],
             'g_rpn_option':["Tp -> {0:s} | Tx ".format(common.Epsilon),
                 "Tx ->  Ti | +  Tx  Tx  | . Tx Tx  | * Tx| {0:s} Tx".format(common.Option)],
             'g_rpn_snf_option':["Tx -> {0:s} | Tp | Tnp".format(common.Epsilon),
                    "Tp -> . Tp Tp | + Tp Tnp | + Tnp Tp | + Tp Tp |* Tnp| {0:s} Tnp".format(common.Option),
                    "Tnp -> Ti | + Tnp Tnp | . Tp Tnp | . Tnp Tp | . Tnp Tnp"],
             'g_rpn_pi': [ "Tp -> Ti | +  Tp  Tx | + Tnp Tp | . Tx Tp | . Tp {0:s}".format(common.Epsilon),
                           "Tx -> {0:s} | Ti | +  Tx  Tx  | . Tx Tx  | * Tx".format(common.Epsilon),
                           "Tnp -> {0:s} | +  Tnp  Tnp | . Tnp Tnp | . Tp Tnp".format(common.Epsilon)],
             'grse_rpn': ["Tx -> {0:s} | Ti | +  Tx  Tx  | {1:s} Tx Tx | . Tx Tx  | * Tx".format(common.Epsilon, common.Shuffle)],
             'grs_rpn': ["Tx ->  Ti | +  Tx  Tx  | {0:s} Tx Tx | . Tx Tx  | * Tx".format(common.Shuffle)],
             'grf_rpn': ["Tx ->  Ti | +  Tx  Tx  | {0:s} Tx Tx | . Tx Tx ".format(common.Shuffle)],
             'grce_rpn': ["Tx -> {0:s} | Ti | +  Tx  Tx  | {1:s} Tx Tx | . Tx Tx  | * Tx".format(common.Epsilon, common.Conj)],
             'grc_rpn': ["Tx ->  Ti | +  Tx  Tx  | {0:s} Tx Tx | . Tx Tx  | * Tx".format(common.Conj)],
             'grcf_rpn': ["Tx ->  Ti | +  Tx  Tx  | {0:s} Tx Tx | . Tx Tx ".format(common.Conj)],
             'grcn_rpn': ["Tx ->  Ti | +  Tx  Tx  | {0:s} Tx Tx | . Tx Tx  | {1:s} Tx | * Tx".format(common.Conj, common.Not)],
             'grche_rpn': ["Ts -> {0:s} Tx Tx".format(common.Conj), "Tx ->  {0:s} |  Ti | +  Tx  Tx  | . Tx Tx  | * Tx".format(common.Epsilon)],
             'grcv_rpn': [ "Tn -> Ti | + Tn Tx | + Tv Tn | * Tn | . Tn Tx | . Tv Tn | {0:s} Tn Tn".format(common.Conj),
                           "Tv -> {0:s} | + Tv Tv | * Tv | . Tv Tv | {1:s} Tv Tx | {1:s} Tx Tv".format(common.Epsilon, common.Conj),
                          "Tx -> {0:s} | Ti | +  Tx  Tx  | {1:s} Tx Tx | . Tx Tx  | * Tx".format(common.Epsilon, common.Conj)],
             'gpt_rpn':  [" Tz ->  + Tz  Tf | Tf",
                      "Tf -> {0:s} Tf Te | Te".format(common.Conj),
                      "Te -> {0:s} Ts | Ts ".format(common.Not),
                       "Ts -> . Ts Tu | . {0:s} Tu".format(common.SigmaS),
                          "Tu ->  . Ti {0:s}".format(common.SigmaS)],
             '2D_g_rpn': [ "Tx -> / R R | +  Tx  Tx  | . Tx Tx  | * Tx",
                            "R -> {0:s} | Ti | +  R  R  | . R R  | * R".format(common.Epsilon)]
                           
            }