Source code for cfg

# -*- coding: utf-8 -*-
"""**Context Free Grammars Manipulation.**

Basic context-free grammars manipulation for building uniform random generetors

.. *Authors:* Rogério Reis & Nelma Moreira

.. *This is part of FAdo project* http://fado.dcc.fc.up.pt

.. *Copyright:* 1999-2014 Rogério Reis & Nelma Moreira {rvr,nam}@dcc.fc.up.pt


.. This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License along
   with this program; if not, write to the Free Software Foundation, Inc.,
   675 Mass Ave, Cambridge, MA 02139, USA."""

#__package__ = "FAdo"

import re
import string
from types import StringType
from random import randint
import common

[docs]class CFGrammar(object): """ Class for context-free grammars :var Rules: grammar rules :var Terminals: terminals symbols :var Nonterminals: nonterminals symbols :var str Start: start symbol :var ntr: dictionary of rules for each nonterminal""" def __init__(self, gram): """Initialization :param gram: is a list for productions; each production is a tuple (LeftHandside, RightHandside) with LeftHandside nonterminal, RightHandside list of symbols, First production is for start symbol """ self.Rules = gram self.Nonterminals = {r[0] for r in self.Rules} self.Terminals = set() for r in self.Rules: if type(r[1]) is StringType: if r[1] not in self.Nonterminals: self.Terminals.add(r[1]) else: for s in r[1]: if s not in self.Nonterminals: self.Terminals.add(s) self.Start = self.Rules[0][0] self.Nullable = {} self.tr = {} self.ntr = {} for i in xrange(len(self.Rules)): if self.Rules[i][0] not in self.ntr: self.ntr[self.Rules[i][0]] = {i} else: self.ntr[self.Rules[i][0]].add(i) def __str__(self): """Grammar rules :return: a string representing the grammar rules""" s = "" for n in xrange(len(self.Rules)): lhs = self.Rules[n][0] rhs = self.Rules[n][1] if type(rhs) is not StringType and len(rhs) > 1: rhs = string.join(rhs) s += "{0:d} | {1:s} -> {2:s} \n".format(n, lhs, rhs) return "Grammar Rules:\n\n%s" % s
[docs] def maketerminals(self): """Extracts C{terminals} from the rules. Nonterminals must already exist""" self.Terminals = set() for r in self.Rules: if type(r[1]) is StringType: if r[1] not in self.Nonterminals: self.Terminals.add(r[1]) else: for s in r[1]: if s not in self.Nonterminals: self.Terminals.add(s)
[docs] def makenonterminals(self): """Extracts C{nonterminals} from grammar rules.""" for r in self.Rules: self.Nonterminals.add(r[0])
def terminalrules(self): self.tr = {} for a in self.Terminals: for i in xrange(len(self.Rules)): if self.Rules[i][1] == a: if a not in self.tr: self.tr[a] = {i} else: self.tr[a].add(i) def nonterminalrules(self): self.ntr = {} for i in xrange(len(self.Rules)): if self.Rules[i][0] not in self.ntr: self.ntr[self.Rules[i][0]] = {i} else: self.ntr[self.Rules[i][0]].add(i)
[docs] def NULLABLE(self): """Determines which nonterminals X ->* [] """ self.Nullable = {} for s in self.Terminals: self.Nullable[s] = 0 for s in self.Nonterminals: self.Nullable[s] = 0 if s in self.ntr: for i in self.ntr[s]: if not self.Rules[i][1]: self.Nullable[s] = 1 break k = 1 while k == 1: k = 0 for r in self.Rules: e = 0 for i in r[1]: if not self.Nullable[i]: e = 1 break if e == 0 and not self.Nullable[r[0]]: self.Nullable[r[0]] = 1 k = 1
[docs]class CNF(CFGrammar): """No useless nonterminals or epsilon rules are ALLOWED... Given a CFG grammar description generates one in CNF Then its possible to random generate words of a given size. Before some pre-calculations are nedded.""" def __init__(self, gram, mark="A@"): # super(CNF, self).__init__(gram) CFGrammar.__init__(self, gram) self.mark = mark self.newnt = 0 self.nttr = {} self.unitary = self.get_unitary() self.Chomsky() def get_unitary(self): return set([r for r in self.Rules if (type(r[1]) is StringType and r[1] in self.Nonterminals) or (len(r[1]) == 1 and r[1][0] in self.Nonterminals)])
[docs] def elim_unitary(self): """Elimination of unitary rules """ f = 1 while f: f = 0 self.unitary = self.get_unitary() for u in self.unitary: if type(u[1]) is StringType: ui = u[1] else: ui = u[1][0] if ui in self.ntr: for i in self.ntr[ui]: if (u[0], self.Rules[i][1]) not in self.Rules: f = 1 self.Rules.append((u[0], self.Rules[i][1])) self.ntr[u[0]].add(len(self.Rules) - 1) for u in self.unitary: self.Rules.remove(u)
def get_ntr_tr(self, a): nta = self.mark + a self.Nonterminals.add(nta) self.Rules.append((nta, a)) return nta def iter_rule(self, lhs, rhs, i): if type(rhs) is not StringType and len(rhs) == 2: self.Rules[i] = (lhs, rhs) return nta = self.mark + "_" + str(self.newnt) self.Nonterminals.add(nta) self.newnt += 1 self.Rules.append((lhs, (rhs[0], nta))) self.iter_rule(nta, rhs[1:], i)
[docs] def Chomsky(self): """ Transform to CNF """ self.elim_unitary() self.nttr = {} # terminal a is replaced by A@_a in all rules > 2 for a in self.Terminals: for i in xrange(len(self.Rules)): if type(self.Rules[i][1]) is not StringType and len(self.Rules[i][1]) >= 2 and a in self.Rules[i][1]: if a not in self.nttr: self.nttr[a] = self.get_ntr_tr(a) rr = list(self.Rules[i][1]) for k in xrange(len(rr)): if rr[k] == a: rr[k] = self.nttr[a] self.Rules[i] = (self.Rules[i][0], tuple(rr)) n = len(self.Rules) for i in xrange(n): if type(self.Rules[i][1]) is not StringType and len(self.Rules[i][1]) > 2: self.iter_rule(self.Rules[i][0], self.Rules[i][1], i)
[docs]class cfgGenerator(object): """CFG uniform genetaror """ def __init__(self, cfgr, size): """Object initialization :param cfgr: grammar for the random objects :type cfgr: CNF :param size: size of objects :type size: integer""" self.grammar = cfgr self.size = size # self.density = {} # self.density_r = {} self._eval_densities(size)
[docs] def generate(self): """Generates a new random object generated from the start symbol :returns: object :rtype: string""" return self._gen(self.grammar.Start, self.size)
def _gen(self, nt, n): """Generates a new random object generated from the nonterminal :param str nt: nonterminal :param int n: object size :returns: object :rtype: str""" g = self.grammar if n in self.density[nt] and self.density[nt][n] > 0: u = randint(1, self.density[nt][n]) r = 1 if n == 1: for i in g.ntr[nt]: if g.Rules[i][1] in g.Terminals: r += 1 if r > u: ic = i break try: return g.Rules[ic][1] except KeyError: raise KeyError for i in g.ntr[nt]: if len(g.Rules[i][1]) == 2: if n in self.density_r[i]: r += self.density_r[i][n] if r > u: ic = i break uk = randint(1, self.density_r[ic][n]) rk = 1 for k in xrange(1, n): if (k in self.density[g.Rules[ic][1][0]] and self.density[g.Rules[ic][1][0]][k] > 0 and n - k in self.density[g.Rules[ic][1][1]] and self.density[g.Rules[ic][1][1]][ n - k] > 0): rk += self.density[g.Rules[ic][1][0]][k] * self.density[g.Rules[ic][1][1]][n - k] if rk > uk: kk = k break return self._gen(g.Rules[ic][1][0], kk) + self._gen(g.Rules[ic][1][1], n - kk) def _eval_densities(self, n): """Evaluates densities :param int n: object size""" g = self.grammar self.density = {} self.density_r = {} for nt in g.Nonterminals: self.density[nt] = {} self.density[nt][1] = 0 g.terminalrules() g.nonterminalrules() for t in g.tr: for i in g.tr[t]: self.density[g.Rules[i][0]][1] += 1 for l in xrange(2, n + 1): for nt in g.ntr: r = 0 for i in g.ntr[nt]: if len(g.Rules[i][1]) == 2: if i not in self.density_r: self.density_r[i] = {} self.density_r[i][l] = sum( [self.density[g.Rules[i][1][0]][k] * self.density[g.Rules[i][1][1]][l - k] for k in xrange(1, l) if k in self.density[g.Rules[i][1][0]] and l - k in self.density[g.Rules[i][1][1]]]) r += self.density_r[i][l] if r: self.density[nt][l] = r
[docs]class reStringRGenerator(cfgGenerator): """Uniform random Generator for reStrings""" def __init__(self, Sigma=None, size=10, cfgr=None, epsilon=None, empty=None, ident="Ti"): """ Uniform random generator for regular expressions. Used without arguments generates an uncollapsible re over {a,b} with size 10. For generate an arbitary re over an alphabet of 10 symbols of size 100: reStringRGenerator (smallAlphabet(10),100,reGrammar["g_regular_base"]) :param list|set Sigma: re alphabet (that will be the set of grammar terminals) :param int size: word size :param cfgr: base grammar :param epsilon: if not None is added to a grammar terminals :param empty: if not None is added to a grammar terminals .. note:: the grammar can have already this symbols""" if cfgr is None: self.base = gRules(reGrammar["g_regular_uncollaps"]) else: self.base = gRules(cfgr) if Sigma is None: self.Sigma = ["a", "b"] else: self.Sigma = Sigma for i in self.Sigma: self.base.append((ident, i)) if epsilon is not None: self.base.append((ident, common.Epsilon)) if empty is not None: self.base.append((ident, common.EmptySet)) super(reStringRGenerator, self).__init__(CNF(self.base),size)
#noinspection PyUnusedLocal def CYKParserTable(gramm, word): """Evaluates CYK parser table :param CNF gramm: grammar :param str word: word to be parsed :returns: the CYK table :rtype: list of lists of symbols""" pass
[docs]def gRules(rules_list, rulesym="->", rhssep=None, rulesep='|'): """Transforms a list of rules into a grammar description. :param rules_list: is a list of rule where rule is a string of the form: Word rulesym Word1 ... Word2 or Word rulesym [] :param rulesym: LHS and RHS rule separator :param rhssep: RHS values separator (None for white chars) :return: a grammar description """ gr = [] sep = re.compile(rulesym) #rsep = re.compile(rulesep) for r in rules_list: if type(r) is StringType: rule = r else: rule = r[0] m = sep.search(rule) if not m: continue else: if m.start() == 0: raise common.CFGgrammarError(rule) else: lhs = rule[0:m.start()].strip() if m.end() == len(rule): raise common.CFGgrammarError(rule) else: rest = string.strip(rule[m.end():]) if rest == "[]": rhs = [] else: multi = string.split(rest, rulesep) rhs = [] for i in multi: l = string.split(i, rhssep) if len(l) > 1: l = tuple(string.split(i, rhssep)) else: l = l[0] gr.append((lhs, l)) return gr
[docs]def smallAlphabet(k, sigma_base="a"): """Easy way to have small alphabets :param k: alphabet size (must be less than 52) :param sigma_base: initial symbol :returns: alphabet :rtype: list""" Sigma = [] if k >= 52: raise common.CFGterminalError(k) lim = min(26, k) for i in xrange(lim): Sigma.append(chr(ord(sigma_base) + i)) if k >= 26: sigma_base = 'A' for i in xrange(k - lim): Sigma.append(chr(ord(sigma_base) + i)) return Sigma
reGrammar = {'g_regular_base': ["Tr -> Tr + Tc | Tc", "Tc -> Tc Ts | Ts", "Ts -> Ts * | Ti | ( Tr ) "], 'g_finite_base': ["Tr -> Tr + Tc | Tc", "Tc -> Tc Ts | Ts", "Ts -> Ti | ( Tr ) "], 'g_regular_wredund': ["Tr -> Trs | Tf ", "Trs -> Trs + Tf | Tf + Tf", "Tf -> Tc | Te | Ti", "Tc -> Tc Ts | Ts Ts", "Ts -> Te | Ti | ( Trs ) ", "Te -> ( Trs ) * | ( Tc ) * | Ti *"], 'g_regular_uncollaps': ["Ts -> Trs | Tcc | Tee | Ti | {0:s} | {1:s}".format(common.Epsilon, common.EmptySet), "Tcc -> Tcc Tr | Tr Tr", "Tr -> ( Trs ) | Tee | Ti ", "Tee -> ( Trs ) * | ( Tcc ) * | Ti *", "Trs -> {0:s} + Tx | Ty + Tz".format(common.Epsilon), "Tx -> Tt | Tt + Tx", "Tt -> Tcc | Ti ", "Ty -> Tz | Ty + Tz", "Tz -> Tcc | Tee | Ti "], 'g_pt': [" Tz -> Tz + Tf | Tf", "Tf -> Tf {0:s} Te | Te".format(common.Conj), "Te -> {0:s} ( Ts ) | Ts ".format(common.Not), "Ts -> Ts Tu | {0:s} Tu".format(common.SigmaS), "Tu -> Ti {0:s}".format(common.SigmaS)], 'g_sha': ["Ts -> Ted | Tec | Tes | Ti | {0:s}".format(common.Epsilon), "Tec -> . Tec Tr | . Tr Tr ", "Tr -> Ted | Tes | Ti", "Tes -> * Ted | * Tec | * Ti", "Ted -> + {0:s} Tx | + Ty Tz".format(common.Epsilon), "Tx -> Tt | + Tt Tx", "Tt -> Tec | Ti", "Ty -> Tz | + Ty Tz", "Tz -> Tec | Tes | Ti"], 'g_rpn': ["Tx -> {0:s} | Ti | + Tx Tx | . Tx Tx | * Tx".format(common.Epsilon)], 'g_rpn_finite': ["Tx -> {0:s} | Ti | + Tx Tx | . Tx Tx".format(common.Epsilon)], 'g_rpn_snf':["Tx -> Tp | Tnp", "Tp -> {0:s} | . Tp Tp | + Tp Tnp | + Tnp Tp | + Tp Tp | * Tnp ".format(common.Epsilon), "Tnp -> Ti | + Tnp Tnp | . Tp Tnp | . Tnp Tp | . Tnp Tnp"], 'g_rpn_snf_uf': ["Tx -> Tp | Tnp", "Tp -> . Tp Tp | * Tnp ", "Tnp -> Ti | . Tp Tnp | . Tnp Tp | . Tnp Tnp"], 'g_rpn_option':["Tp -> {0:s} | Tx ".format(common.Epsilon), "Tx -> Ti | + Tx Tx | . Tx Tx | * Tx| {0:s} Tx".format(common.Option)], 'g_rpn_snf_option':["Tx -> {0:s} | Tp | Tnp".format(common.Epsilon), "Tp -> . Tp Tp | + Tp Tnp | + Tnp Tp | + Tp Tp |* Tnp| {0:s} Tnp".format(common.Option), "Tnp -> Ti | + Tnp Tnp | . Tp Tnp | . Tnp Tp | . Tnp Tnp"], 'g_rpn_pi': [ "Tp -> Ti | + Tp Tx | + Tnp Tp | . Tx Tp | . Tp {0:s}".format(common.Epsilon), "Tx -> {0:s} | Ti | + Tx Tx | . Tx Tx | * Tx".format(common.Epsilon), "Tnp -> {0:s} | + Tnp Tnp | . Tnp Tnp | . Tp Tnp".format(common.Epsilon)], 'grse_rpn': ["Tx -> {0:s} | Ti | + Tx Tx | {1:s} Tx Tx | . Tx Tx | * Tx".format(common.Epsilon, common.Shuffle)], 'grs_rpn': ["Tx -> Ti | + Tx Tx | {0:s} Tx Tx | . Tx Tx | * Tx".format(common.Shuffle)], 'grf_rpn': ["Tx -> Ti | + Tx Tx | {0:s} Tx Tx | . Tx Tx ".format(common.Shuffle)], 'grce_rpn': ["Tx -> {0:s} | Ti | + Tx Tx | {1:s} Tx Tx | . Tx Tx | * Tx".format(common.Epsilon, common.Conj)], 'grc_rpn': ["Tx -> Ti | + Tx Tx | {0:s} Tx Tx | . Tx Tx | * Tx".format(common.Conj)], 'grcf_rpn': ["Tx -> Ti | + Tx Tx | {0:s} Tx Tx | . Tx Tx ".format(common.Conj)], 'grcn_rpn': ["Tx -> Ti | + Tx Tx | {0:s} Tx Tx | . Tx Tx | {1:s} Tx | * Tx".format(common.Conj, common.Not)], 'grche_rpn': ["Ts -> {0:s} Tx Tx".format(common.Conj), "Tx -> {0:s} | Ti | + Tx Tx | . Tx Tx | * Tx".format(common.Epsilon)], 'grcv_rpn': [ "Tn -> Ti | + Tn Tx | + Tv Tn | * Tn | . Tn Tx | . Tv Tn | {0:s} Tn Tn".format(common.Conj), "Tv -> {0:s} | + Tv Tv | * Tv | . Tv Tv | {1:s} Tv Tx | {1:s} Tx Tv".format(common.Epsilon, common.Conj), "Tx -> {0:s} | Ti | + Tx Tx | {1:s} Tx Tx | . Tx Tx | * Tx".format(common.Epsilon, common.Conj)], 'gpt_rpn': [" Tz -> + Tz Tf | Tf", "Tf -> {0:s} Tf Te | Te".format(common.Conj), "Te -> {0:s} Ts | Ts ".format(common.Not), "Ts -> . Ts Tu | . {0:s} Tu".format(common.SigmaS), "Tu -> . Ti {0:s}".format(common.SigmaS)], '2D_g_rpn': [ "Tx -> / R R | + Tx Tx | . Tx Tx | * Tx", "R -> {0:s} | Ti | + R R | . R R | * R".format(common.Epsilon)] }