YAP 7.1.0
regcomp.c
Go to the documentation of this file.
1/*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
38 */
39
48#if defined(LIBC_SCCS) && !defined(lint)
49static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
50#endif /* LIBC_SCCS and not lint */
51
52#include "YapInterface.h"
53
54#if HAVE_SYS_TYPES_H
55#include <sys/types.h>
56#endif
57#include <stdio.h>
58#include <stdlib.h>
59#if HAVE_STRING_H
60#include <string.h>
61#endif
62#if HAVE_CTYPE_H
63#include <ctype.h>
64#endif
65#if HAVE_LIMITS_H
66#include <limits.h>
67#endif
68#if HAVE_MALLOC_H
69#include <malloc.h>
70#endif
71#include "yapregex.h"
72
73#include "collate.h"
74
75#include "regex2.h"
76#include "utils.h"
77
78#include "cclass.h"
79#include "cname.h"
80
81int collate_load_error;
82int collate_substitute_nontrivial;
83char collate_version[STR_LEN];
84unsigned char collate_substitute_table[UCHAR_MAX + 1][STR_LEN];
85struct collate_st_char_pri collate_char_pri_table[UCHAR_MAX + 1];
86struct collate_st_chain_pri collate_chain_pri_table[TABLE_SIZE];
87
88#if __WINDOWS__
89#define isblank(X) isspace(X)
90#ifndef isascii
91#define isascii(X) ((unsigned int)(X) < 0177)
92#endif
93#endif
94
95/*
96 * Compare two characters converting collate information
97 * into ASCII-compatible range, it allows to handle
98 * "[a-z]"-type ranges with national characters.
99 */
100static int collate_range_cmp(int c1, int c2) {
101 static char s1[2], s2[2];
102 int ret;
103#ifndef ASCII_COMPATIBLE_COLLATE
104 int as1, as2, al1, al2;
105#endif
106
107 c1 &= UCHAR_MAX;
108 c2 &= UCHAR_MAX;
109 if (c1 == c2)
110 return (0);
111
112#ifndef ASCII_COMPATIBLE_COLLATE
113 as1 = isascii(c1);
114 as2 = isascii(c2);
115 al1 = isalpha(c1);
116 al2 = isalpha(c2);
117
118 if (as1 || as2 || al1 || al2) {
119 if ((as1 && as2) || (!al1 && !al2))
120 return (c1 - c2);
121 if (al1 && !al2) {
122 if (isupper(c1))
123 return ('A' - c2);
124 else
125 return ('a' - c2);
126 } else if (al2 && !al1) {
127 if (isupper(c2))
128 return (c1 - 'A');
129 else
130 return (c1 - 'a');
131 }
132 }
133#endif
134 s1[0] = c1;
135 s2[0] = c2;
136 if ((ret = strcoll(s1, s2)) != 0)
137 return (ret);
138 return (c1 - c2);
139}
140
141/*
142 * parse structure, passed up and down to avoid global variables and
143 * other clumsinesses
144 */
145struct parse {
146 char *next; /* next character in RE */
147 char *end; /* end of string (-> NUL normally) */
148 int error; /* has an error been seen? */
149 sop *strip; /* malloced strip */
150 sopno ssize; /* malloced strip size (allocated) */
151 sopno slen; /* malloced strip length (used) */
152 int ncsalloc; /* number of csets allocated */
153 struct re_guts *g;
154#define NPAREN 10 /* we need to remember () 1-9 for back refs */
155 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
156 sopno pend[NPAREN]; /* -> ) ([0] unused) */
157};
158
159/* ========= begin header generated by ./mkh ========= */
160#ifdef __cplusplus
161extern "C" {
162#endif
163
164/* === regcomp.c === */
165static void p_ere(struct parse *p, int stop);
166static void p_ere_exp(struct parse *p);
167static void p_str(struct parse *p);
168static void p_bre(struct parse *p, int end1, int end2);
169static int p_simp_re(struct parse *p, int starordinary);
170static int p_count(struct parse *p);
171static void p_bracket(struct parse *p);
172static void p_b_term(struct parse *p, cset *cs);
173static void p_b_cclass(struct parse *p, cset *cs);
174static void p_b_eclass(struct parse *p, cset *cs);
175static char p_b_symbol(struct parse *p);
176static char p_b_coll_elem(struct parse *p, int endc);
177static char othercase(int ch);
178static void bothcases(struct parse *p, int ch);
179static void ordinary(struct parse *p, int ch);
180static void nonnewline(struct parse *p);
181static void repeat(struct parse *p, sopno start, int from, int to);
182static int seterr(struct parse *p, int e);
183static cset *allocset(struct parse *p);
184static void freeset(struct parse *p, cset *cs);
185static int freezeset(struct parse *p, cset *cs);
186static int firstch(struct parse *p, cset *cs);
187static int nch(struct parse *p, cset *cs);
188#if 0
189static void mcadd(struct parse *p, cset *cs, char *cp);
190#endif
191#if used
192static void mcsub(cset *cs, char *cp);
193static int mcin(cset *cs, char *cp);
194static char *mcfind(cset *cs, char *cp);
195#endif
196static void mcinvert(struct parse *p, cset *cs);
197static void mccase(struct parse *p, cset *cs);
198static int isinsets(struct re_guts *g, int c);
199static int samesets(struct re_guts *g, int c1, int c2);
200static void categorize(struct parse *p, struct re_guts *g);
201static sopno dupl(struct parse *p, sopno start, sopno finish);
202static void doemit(struct parse *p, sop op, size_t opnd);
203static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
204static void dofwd(struct parse *p, sopno pos, sop value);
205static void enlarge(struct parse *p, sopno size);
206static void stripsnug(struct parse *p, struct re_guts *g);
207static void findmust(struct parse *p, struct re_guts *g);
208static sopno pluscount(struct parse *p, struct re_guts *g);
209
210#ifdef __cplusplus
211}
212#endif
213/* ========= end header generated by ./mkh ========= */
214
215static char nuls[10]; /* place to point scanner in event of error */
216
217/*
218 * macros for use with parse structure
219 * BEWARE: these know that the parse structure is named `p' !!!
220 */
221#define PEEK() (*p->next)
222#define PEEK2() (*(p->next + 1))
223#define MORE() (p->next < p->end)
224#define MORE2() (p->next + 1 < p->end)
225#define SEE(c) (MORE() && PEEK() == (c))
226#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
227#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
228#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
229#define NEXT() (p->next++)
230#define NEXT2() (p->next += 2)
231#define NEXTn(n) (p->next += (n))
232#define GETNEXT() (*p->next++)
233#define SETERROR(e) seterr(p, (e))
234#define REQUIRE(co, e) ((co) || SETERROR(e))
235#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
236#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
237#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
238#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
239#define INSERT(op, pos) doinsert(p, (sop)(op), HERE() - (pos) + 1, pos)
240#define AHEAD(pos) dofwd(p, pos, HERE() - (pos))
241#define ASTERN(sop, pos) EMIT(sop, HERE() - pos)
242#define HERE() (p->slen)
243#define THERE() (p->slen - 1)
244#define THERETHERE() (p->slen - 2)
245#define DROP(n) (p->slen -= (n))
246
247#ifndef NDEBUG
248static int never = 0; /* for use in asserts; shuts lint up */
249#else
250#define never 0 /* some <assert.h>s have bugs too */
251#endif
252
253/*
254 - regcomp - interface for parser and compilation
255 = extern int regcomp(regex_t *, const char *, int);
256 = #define REG_BASIC 0000
257 = #define REG_EXTENDED 0001
258 = #define REG_ICASE 0002
259 = #define REG_NOSUB 0004
260 = #define REG_NEWLINE 0010
261 = #define REG_NOSPEC 0020
262 = #define REG_PEND 0040
263 = #define REG_DUMP 0200
264 */
265int /* 0 success, otherwise REG_something */
266 yap_regcomp(preg, pattern, cflags) regex_t *preg;
267const char *pattern;
268int cflags;
269{
270 struct parse pa;
271 register struct re_guts *g;
272 register struct parse *p = &pa;
273 register int i;
274 register size_t len;
275#ifdef REDEBUG
276#define GOODFLAGS(f) (f)
277#else
278#define GOODFLAGS(f) ((f) & ~REG_DUMP)
279#endif
280
281 cflags = GOODFLAGS(cflags);
282 if ((cflags & REG_EXTENDED) && (cflags & REG_NOSPEC))
283 return (REG_INVARG);
284
285 if (cflags & REG_PEND) {
286 if (preg->re_endp < pattern)
287 return (REG_INVARG);
288 len = preg->re_endp - pattern;
289 } else
290 len = strlen((char *)pattern);
291
292 /* do the mallocs early so failure handling is easy */
293 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
294 (NC - 1) * sizeof(cat_t));
295 if (g == NULL)
296 return (REG_ESPACE);
297 p->ssize = len / (size_t)2 * (size_t)3 + (size_t)1; /* ugh */
298 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
299 p->slen = 0;
300 if (p->strip == NULL) {
301 free((char *)g);
302 return (REG_ESPACE);
303 }
304
305 /* set things up */
306 p->g = g;
307 p->next = (char *)pattern; /* convenience; we do not modify it */
308 p->end = p->next + len;
309 p->error = 0;
310 p->ncsalloc = 0;
311 for (i = 0; i < NPAREN; i++) {
312 p->pbegin[i] = 0;
313 p->pend[i] = 0;
314 }
315 g->csetsize = NC;
316 g->sets = NULL;
317 g->setbits = NULL;
318 g->ncsets = 0;
319 g->cflags = cflags;
320 g->iflags = 0;
321 g->nbol = 0;
322 g->neol = 0;
323 g->must = NULL;
324 g->mlen = 0;
325 g->nsub = 0;
326 g->ncategories = 1; /* category 0 is "everything else" */
327 g->categories = &g->catspace[-(CHAR_MIN)];
328 (void)memset((char *)g->catspace, 0, NC * sizeof(cat_t));
329 g->backrefs = 0;
330
331 /* do it */
332 EMIT(OEND, 0);
333 g->firststate = THERE();
334 if (cflags & REG_EXTENDED)
335 p_ere(p, OUT);
336 else if (cflags & REG_NOSPEC)
337 p_str(p);
338 else
339 p_bre(p, OUT, OUT);
340 EMIT(OEND, 0);
341 g->laststate = THERE();
342
343 /* tidy up loose ends and fill things in */
344 categorize(p, g);
345 stripsnug(p, g);
346 findmust(p, g);
347 g->nplus = pluscount(p, g);
348 g->magic = MAGIC2;
349 preg->re_nsub = g->nsub;
350 preg->re_g = g;
351 preg->re_magic = MAGIC1;
352#ifndef REDEBUG
353 /* not debugging, so can't rely on the assert() in regexec() */
354 if (g->iflags & BAD)
355 SETERROR(REG_ASSERT);
356#endif
357
358 /* win or lose, we're done */
359 if (p->error != 0) /* lose */
360 yap_regfree(preg);
361 return (p->error);
362}
363
364/*
365 - p_ere - ERE parser top level, concatenation and alternation
366 == static void p_ere(register struct parse *p, int stop);
367 */
368static void p_ere(p, stop) register struct parse *p;
369int stop; /* character this ERE should end at */
370{
371 register char c;
372 register sopno prevback = 0;
373 register sopno prevfwd = 0;
374 register sopno conc;
375 register int first = 1; /* is this the first alternative? */
376
377 for (;;) {
378 /* do a bunch of concatenated expressions */
379 conc = HERE();
380 while (MORE() && (c = PEEK()) != '|' && c != stop)
381 p_ere_exp(p);
382 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
383
384 if (!EAT('|'))
385 break; /* NOTE BREAK OUT */
386
387 if (first) {
388 INSERT(OCH_, conc); /* offset is wrong */
389 prevfwd = conc;
390 prevback = conc;
391 first = 0;
392 }
393 ASTERN(OOR1, prevback);
394 prevback = THERE();
395 AHEAD(prevfwd); /* fix previous offset */
396 prevfwd = HERE();
397 EMIT(OOR2, 0); /* offset is very wrong */
398 }
399
400 if (!first) { /* tail-end fixups */
401 AHEAD(prevfwd);
402 ASTERN(O_CH, prevback);
403 }
404
405 assert(!MORE() || SEE(stop));
406}
407
408/*
409 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
410 == static void p_ere_exp(register struct parse *p);
411 */
412static void p_ere_exp(p) register struct parse *p;
413{
414 register char c;
415 register sopno pos;
416 register int count;
417 register int count2;
418 register sopno subno;
419 int wascaret = 0;
420
421 assert(MORE()); /* caller should have ensured this */
422 c = GETNEXT();
423
424 pos = HERE();
425 switch (c) {
426 case '(':
427 (void)REQUIRE(MORE(), REG_EPAREN);
428 p->g->nsub++;
429 subno = p->g->nsub;
430 if (subno < NPAREN)
431 p->pbegin[subno] = HERE();
432 EMIT(OLPAREN, subno);
433 if (!SEE(')'))
434 p_ere(p, ')');
435 if (subno < NPAREN) {
436 p->pend[subno] = HERE();
437 assert(p->pend[subno] != 0);
438 }
439 EMIT(ORPAREN, subno);
440 (void)MUSTEAT(')', REG_EPAREN);
441 break;
442#ifndef POSIX_MISTAKE
443 case ')': /* happens only if no current unmatched ( */
444 /*
445 * You may ask, why the ifndef? Because I didn't notice
446 * this until slightly too late for 1003.2, and none of the
447 * other 1003.2 regular-expression reviewers noticed it at
448 * all. So an unmatched ) is legal POSIX, at least until
449 * we can get it fixed.
450 */
451 SETERROR(REG_EPAREN);
452 break;
453#endif
454 case '^':
455 EMIT(OBOL, 0);
456 p->g->iflags |= USEBOL;
457 p->g->nbol++;
458 wascaret = 1;
459 break;
460 case '$':
461 EMIT(OEOL, 0);
462 p->g->iflags |= USEEOL;
463 p->g->neol++;
464 break;
465 case '|':
466 SETERROR(REG_EMPTY);
467 break;
468 case '*':
469 case '+':
470 case '?':
471 SETERROR(REG_BADRPT);
472 break;
473 case '.':
474 if (p->g->cflags & REG_NEWLINE)
475 nonnewline(p);
476 else
477 EMIT(OANY, 0);
478 break;
479 case '[':
480 p_bracket(p);
481 break;
482 case '\\':
483 (void)REQUIRE(MORE(), REG_EESCAPE);
484 c = GETNEXT();
485 ordinary(p, c);
486 break;
487 case '{': /* okay as ordinary except if digit follows */
488 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
489 /* FALLTHROUGH */
490 default:
491 ordinary(p, c);
492 break;
493 }
494
495 if (!MORE())
496 return;
497 c = PEEK();
498 /* we call { a repetition if followed by a digit */
499 if (!(c == '*' || c == '+' || c == '?' ||
500 (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
501 return; /* no repetition, we're done */
502 NEXT();
503
504 (void)REQUIRE(!wascaret, REG_BADRPT);
505 switch (c) {
506 case '*': /* implemented as +? */
507 /* this case does not require the (y|) trick, noKLUDGE */
508 INSERT(OPLUS_, pos);
509 ASTERN(O_PLUS, pos);
510 INSERT(OQUEST_, pos);
511 ASTERN(O_QUEST, pos);
512 break;
513 case '+':
514 INSERT(OPLUS_, pos);
515 ASTERN(O_PLUS, pos);
516 break;
517 case '?':
518 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
519 INSERT(OCH_, pos); /* offset slightly wrong */
520 ASTERN(OOR1, pos); /* this one's right */
521 AHEAD(pos); /* fix the OCH_ */
522 EMIT(OOR2, 0); /* offset very wrong... */
523 AHEAD(THERE()); /* ...so fix it */
524 ASTERN(O_CH, THERETHERE());
525 break;
526 case '{':
527 count = p_count(p);
528 if (EAT(',')) {
529 if (isdigit((uch)PEEK())) {
530 count2 = p_count(p);
531 (void)REQUIRE(count <= count2, REG_BADBR);
532 } else /* single number with comma */
533 count2 = INFINITY;
534 } else /* just a single number */
535 count2 = count;
536 repeat(p, pos, count, count2);
537 if (!EAT('}')) { /* error heuristics */
538 while (MORE() && PEEK() != '}')
539 NEXT();
540 (void)REQUIRE(MORE(), REG_EBRACE);
541 SETERROR(REG_BADBR);
542 }
543 break;
544 }
545
546 if (!MORE())
547 return;
548 c = PEEK();
549 if (!(c == '*' || c == '+' || c == '?' ||
550 (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
551 return;
552 SETERROR(REG_BADRPT);
553}
554
555/*
556 - p_str - string (no metacharacters) "parser"
557 == static void p_str(register struct parse *p);
558 */
559static void p_str(p) register struct parse *p;
560{
561 (void)REQUIRE(MORE(), REG_EMPTY);
562 while (MORE())
563 ordinary(p, GETNEXT());
564}
565
566/*
567 - p_bre - BRE parser top level, anchoring and concatenation
568 == static void p_bre(register struct parse *p, register int end1, \
569 == register int end2);
570 * Giving end1 as OUT essentially eliminates the end1/end2 check.
571 *
572 * This implementation is a bit of a kludge, in that a trailing $ is first
573 * taken as an ordinary character and then revised to be an anchor. The
574 * only undesirable side effect is that '$' gets included as a character
575 * category in such cases. This is fairly harmless; not worth fixing.
576 * The amount of lookahead needed to avoid this kludge is excessive.
577 */
578static void p_bre(p, end1, end2) register struct parse *p;
579register int end1; /* first terminating character */
580register int end2; /* second terminating character */
581{
582 register sopno start = HERE();
583 register int first = 1; /* first subexpression? */
584 register int wasdollar = 0;
585
586 if (EAT('^')) {
587 EMIT(OBOL, 0);
588 p->g->iflags |= USEBOL;
589 p->g->nbol++;
590 }
591 while (MORE() && !SEETWO(end1, end2)) {
592 wasdollar = p_simp_re(p, first);
593 first = 0;
594 }
595 if (wasdollar) { /* oops, that was a trailing anchor */
596 DROP(1);
597 EMIT(OEOL, 0);
598 p->g->iflags |= USEEOL;
599 p->g->neol++;
600 }
601
602 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
603}
604
605/*
606 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
607 == static int p_simp_re(register struct parse *p, int starordinary);
608 */
609static int /* was the simple RE an unbackslashed $? */
610 p_simp_re(p, starordinary) register struct parse *p;
611int starordinary; /* is a leading * an ordinary character? */
612{
613 register int c;
614 register int count;
615 register int count2;
616 register sopno pos;
617 register int i;
618 register sopno subno;
619#define BACKSL (1 << CHAR_BIT)
620
621 pos = HERE(); /* repetion op, if any, covers from here */
622
623 assert(MORE()); /* caller should have ensured this */
624 c = GETNEXT();
625 if (c == '\\') {
626 (void)REQUIRE(MORE(), REG_EESCAPE);
627 c = BACKSL | GETNEXT();
628 }
629 switch (c) {
630 case '.':
631 if (p->g->cflags & REG_NEWLINE)
632 nonnewline(p);
633 else
634 EMIT(OANY, 0);
635 break;
636 case '[':
637 p_bracket(p);
638 break;
639 case BACKSL | '{':
640 SETERROR(REG_BADRPT);
641 break;
642 case BACKSL | '(':
643 p->g->nsub++;
644 subno = p->g->nsub;
645 if (subno < NPAREN)
646 p->pbegin[subno] = HERE();
647 EMIT(OLPAREN, subno);
648 /* the MORE here is an error heuristic */
649 if (MORE() && !SEETWO('\\', ')'))
650 p_bre(p, '\\', ')');
651 if (subno < NPAREN) {
652 p->pend[subno] = HERE();
653 assert(p->pend[subno] != 0);
654 }
655 EMIT(ORPAREN, subno);
656 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
657 break;
658 case BACKSL | ')': /* should not get here -- must be user */
659 case BACKSL | '}':
660 SETERROR(REG_EPAREN);
661 break;
662 case BACKSL | '1':
663 case BACKSL | '2':
664 case BACKSL | '3':
665 case BACKSL | '4':
666 case BACKSL | '5':
667 case BACKSL | '6':
668 case BACKSL | '7':
669 case BACKSL | '8':
670 case BACKSL | '9':
671 i = (c & ~BACKSL) - '0';
672 assert(i < NPAREN);
673 if (p->pend[i] != 0) {
674 assert(i <= p->g->nsub);
675 EMIT(OBACK_, i);
676 assert(p->pbegin[i] != 0);
677 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
678 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
679 (void)dupl(p, p->pbegin[i] + 1, p->pend[i]);
680 EMIT(O_BACK, i);
681 } else
682 SETERROR(REG_ESUBREG);
683 p->g->backrefs = 1;
684 break;
685 case '*':
686 (void)REQUIRE(starordinary, REG_BADRPT);
687 /* FALLTHROUGH */
688 default:
689 ordinary(p, (char)c);
690 break;
691 }
692
693 if (EAT('*')) { /* implemented as +? */
694 /* this case does not require the (y|) trick, noKLUDGE */
695 INSERT(OPLUS_, pos);
696 ASTERN(O_PLUS, pos);
697 INSERT(OQUEST_, pos);
698 ASTERN(O_QUEST, pos);
699 } else if (EATTWO('\\', '{')) {
700 count = p_count(p);
701 if (EAT(',')) {
702 if (MORE() && isdigit((uch)PEEK())) {
703 count2 = p_count(p);
704 (void)REQUIRE(count <= count2, REG_BADBR);
705 } else /* single number with comma */
706 count2 = INFINITY;
707 } else /* just a single number */
708 count2 = count;
709 repeat(p, pos, count, count2);
710 if (!EATTWO('\\', '}')) { /* error heuristics */
711 while (MORE() && !SEETWO('\\', '}'))
712 NEXT();
713 (void)REQUIRE(MORE(), REG_EBRACE);
714 SETERROR(REG_BADBR);
715 }
716 } else if (c == '$') /* $ (but not \$) ends it */
717 return (1);
718
719 return (0);
720}
721
722/*
723 - p_count - parse a repetition count
724 == static int p_count(register struct parse *p);
725 */
726static int /* the value */
727 p_count(p) register struct parse *p;
728{
729 register int count = 0;
730 register int ndigits = 0;
731
732 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
733 count = count * 10 + (GETNEXT() - '0');
734 ndigits++;
735 }
736
737 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
738 return (count);
739}
740
741/*
742 - p_bracket - parse a bracketed character list
743 == static void p_bracket(register struct parse *p);
744 *
745 * Note a significant property of this code: if the allocset() did SETERROR,
746 * no set operations are done.
747 */
748static void p_bracket(p) register struct parse *p;
749{
750 register cset *cs = allocset(p);
751 register int invert = 0;
752
753 /* Dept of Truly Sickening Special-Case Kludges */
754 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
755 EMIT(OBOW, 0);
756 NEXTn(6);
757 return;
758 }
759 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
760 EMIT(OEOW, 0);
761 NEXTn(6);
762 return;
763 }
764
765 if (EAT('^'))
766 invert++; /* make note to invert set at end */
767 if (EAT(']'))
768 CHadd(cs, ']');
769 else if (EAT('-'))
770 CHadd(cs, '-');
771 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
772 p_b_term(p, cs);
773 if (EAT('-'))
774 CHadd(cs, '-');
775 (void)MUSTEAT(']', REG_EBRACK);
776
777 if (p->error != 0) /* don't mess things up further */
778 return;
779
780 if (p->g->cflags & REG_ICASE) {
781 register int i;
782 register int ci;
783
784 for (i = p->g->csetsize - 1; i >= 0; i--)
785 if (CHIN(cs, i) && isalpha(i)) {
786 ci = othercase(i);
787 if (ci != i)
788 CHadd(cs, ci);
789 }
790 if (cs->multis != NULL)
791 mccase(p, cs);
792 }
793 if (invert) {
794 register int i;
795
796 for (i = p->g->csetsize - 1; i >= 0; i--)
797 if (CHIN(cs, i))
798 CHsub(cs, i);
799 else
800 CHadd(cs, i);
801 if (p->g->cflags & REG_NEWLINE)
802 CHsub(cs, '\n');
803 if (cs->multis != NULL)
804 mcinvert(p, cs);
805 }
806
807 assert(cs->multis == NULL); /* xxx */
808
809 if (nch(p, cs) == 1) { /* optimize singleton sets */
810 ordinary(p, firstch(p, cs));
811 freeset(p, cs);
812 } else
813 EMIT(OANYOF, freezeset(p, cs));
814}
815
816/*
817 - p_b_term - parse one term of a bracketed character list
818 == static void p_b_term(register struct parse *p, register cset *cs);
819 */
820static void p_b_term(p, cs) register struct parse *p;
821register cset *cs;
822{
823 register char c;
824 register char start, finish;
825 register int i;
826
827 /* classify what we've got */
828 switch ((MORE()) ? PEEK() : '\0') {
829 case '[':
830 c = (MORE2()) ? PEEK2() : '\0';
831 break;
832 case '-':
833 SETERROR(REG_ERANGE);
834 return; /* NOTE RETURN */
835 break;
836 default:
837 c = '\0';
838 break;
839 }
840
841 switch (c) {
842 case ':': /* character class */
843 NEXT2();
844 (void)REQUIRE(MORE(), REG_EBRACK);
845 c = PEEK();
846 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
847 p_b_cclass(p, cs);
848 (void)REQUIRE(MORE(), REG_EBRACK);
849 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
850 break;
851 case '=': /* equivalence class */
852 NEXT2();
853 (void)REQUIRE(MORE(), REG_EBRACK);
854 c = PEEK();
855 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
856 p_b_eclass(p, cs);
857 (void)REQUIRE(MORE(), REG_EBRACK);
858 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
859 break;
860 default: /* symbol, ordinary character, or range */
861 /* xxx revision needed for multichar stuff */
862 start = p_b_symbol(p);
863 if (SEE('-') && MORE2() && PEEK2() != ']') {
864 /* range */
865 NEXT();
866 if (EAT('-'))
867 finish = '-';
868 else
869 finish = p_b_symbol(p);
870 } else
871 finish = start;
872 if (start == finish)
873 CHadd(cs, start);
874 else {
875 if (collate_load_error) {
876 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
877 for (i = (uch)start; i <= (uch)finish; i++)
878 CHadd(cs, i);
879 } else {
880 (void)REQUIRE(collate_range_cmp(start, finish) <= 0, REG_ERANGE);
881 for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
882 if (collate_range_cmp(start, i) <= 0 &&
883 collate_range_cmp(i, finish) <= 0)
884 CHadd(cs, i);
885 }
886 }
887 }
888 break;
889 }
890}
891
892/*
893 - p_b_cclass - parse a character-class name and deal with it
894 == static void p_b_cclass(register struct parse *p, register cset *cs);
895 */
896static void p_b_cclass(p, cs) register struct parse *p;
897register cset *cs;
898{
899 register int c;
900 register char *sp = p->next;
901 register struct cclass *cp;
902 register size_t len;
903
904 while (MORE() && isalpha((uch)PEEK()))
905 NEXT();
906 len = p->next - sp;
907 for (cp = cclasses; cp->name != NULL; cp++)
908 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
909 break;
910 if (cp->name == NULL) {
911 /* oops, didn't find it */
912 SETERROR(REG_ECTYPE);
913 return;
914 }
915
916 switch (cp->fidx) {
917 case CALNUM:
918 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
919 if (isalpha((uch)c) || isdigit((uch)c))
920 CHadd(cs, c);
921 break;
922 case CALPHA:
923 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
924 if (isalpha((uch)c))
925 CHadd(cs, c);
926 break;
927 case CBLANK:
928 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
929 if (((uch)c) == ' ' || ((uch)c) == '\t')
930 CHadd(cs, c);
931 break;
932 case CCNTRL:
933 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
934 if (iscntrl((uch)c))
935 CHadd(cs, c);
936 break;
937 case CDIGIT:
938 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
939 if (isdigit((uch)c))
940 CHadd(cs, c);
941 break;
942 case CGRAPH:
943 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
944 if (isgraph((uch)c))
945 CHadd(cs, c);
946 break;
947 case CLOWER:
948 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
949 if (islower((uch)c))
950 CHadd(cs, c);
951 break;
952 case CPRINT:
953 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
954 if (isprint((uch)c))
955 CHadd(cs, c);
956 break;
957 case CPUNCT:
958 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
959 if (ispunct((uch)c))
960 CHadd(cs, c);
961 break;
962 case CSPACE:
963 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
964 if (isspace((uch)c))
965 CHadd(cs, c);
966 break;
967 case CUPPER:
968 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
969 if (isupper((uch)c))
970 CHadd(cs, c);
971 break;
972 case CXDIGIT:
973 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
974 if (isxdigit((uch)c))
975 CHadd(cs, c);
976 break;
977 }
978#if 0
979 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
980 MCadd(p, cs, u);
981#endif
982}
983
984/*
985 - p_b_eclass - parse an equivalence-class name and deal with it
986 == static void p_b_eclass(register struct parse *p, register cset *cs);
987 *
988 * This implementation is incomplete. xxx
989 */
990static void p_b_eclass(p, cs) register struct parse *p;
991register cset *cs;
992{
993 register char c;
994
995 c = p_b_coll_elem(p, '=');
996 CHadd(cs, c);
997}
998
999/*
1000 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1001 == static char p_b_symbol(register struct parse *p);
1002 */
1003static char /* value of symbol */
1004 p_b_symbol(p) register struct parse *p;
1005{
1006 register char value;
1007
1008 (void)REQUIRE(MORE(), REG_EBRACK);
1009 if (!EATTWO('[', '.'))
1010 return (GETNEXT());
1011
1012 /* collating symbol */
1013 value = p_b_coll_elem(p, '.');
1014 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1015 return (value);
1016}
1017
1018/*
1019 - p_b_coll_elem - parse a collating-element name and look it up
1020 == static char p_b_coll_elem(register struct parse *p, int endc);
1021 */
1022static char /* value of collating element */
1023 p_b_coll_elem(p, endc) register struct parse *p;
1024int endc; /* name ended by endc,']' */
1025{
1026 register char *sp = p->next;
1027 register struct cname *cp;
1028 register int len;
1029
1030 while (MORE() && !SEETWO(endc, ']'))
1031 NEXT();
1032 if (!MORE()) {
1033 SETERROR(REG_EBRACK);
1034 return (0);
1035 }
1036 len = p->next - sp;
1037 for (cp = cnames; cp->name != NULL; cp++)
1038 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1039 return (cp->code); /* known name */
1040 if (len == 1)
1041 return (*sp); /* single character */
1042 SETERROR(REG_ECOLLATE); /* neither */
1043 return (0);
1044}
1045
1046/*
1047 - othercase - return the case counterpart of an alphabetic
1048 == static char othercase(int ch);
1049 */
1050static char /* if no counterpart, return ch */
1051 othercase(ch) int ch;
1052{
1053 ch = (uch)ch;
1054 assert(isalpha(ch));
1055 if (isupper(ch))
1056 return (tolower(ch));
1057 else if (islower(ch))
1058 return (toupper(ch));
1059 else /* peculiar, but could happen */
1060 return (ch);
1061}
1062
1063/*
1064 - bothcases - emit a dualcase version of a two-case character
1065 == static void bothcases(register struct parse *p, int ch);
1066 *
1067 * Boy, is this implementation ever a kludge...
1068 */
1069static void bothcases(p, ch) register struct parse *p;
1070int ch;
1071{
1072 register char *oldnext = p->next;
1073 register char *oldend = p->end;
1074 char bracket[3];
1075
1076 ch = (uch)ch;
1077 assert(othercase(ch) != ch); /* p_bracket() would recurse */
1078 p->next = bracket;
1079 p->end = bracket + 2;
1080 bracket[0] = ch;
1081 bracket[1] = ']';
1082 bracket[2] = '\0';
1083 p_bracket(p);
1084 assert(p->next == bracket + 2);
1085 p->next = oldnext;
1086 p->end = oldend;
1087}
1088
1089/*
1090 - ordinary - emit an ordinary character
1091 == static void ordinary(register struct parse *p, register int ch);
1092 */
1093static void ordinary(p, ch) register struct parse *p;
1094register int ch;
1095{
1096 register cat_t *cap = p->g->categories;
1097
1098 if ((p->g->cflags & REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1099 bothcases(p, ch);
1100 else {
1101 EMIT(OCHAR, (uch)ch);
1102 if (cap[ch] == 0)
1103 cap[ch] = p->g->ncategories++;
1104 }
1105}
1106
1107/*
1108 - nonnewline - emit REG_NEWLINE version of OANY
1109 == static void nonnewline(register struct parse *p);
1110 *
1111 * Boy, is this implementation ever a kludge...
1112 */
1113static void nonnewline(p) register struct parse *p;
1114{
1115 register char *oldnext = p->next;
1116 register char *oldend = p->end;
1117 char bracket[4];
1118
1119 p->next = bracket;
1120 p->end = bracket + 3;
1121 bracket[0] = '^';
1122 bracket[1] = '\n';
1123 bracket[2] = ']';
1124 bracket[3] = '\0';
1125 p_bracket(p);
1126 assert(p->next == bracket + 3);
1127 p->next = oldnext;
1128 p->end = oldend;
1129}
1130
1131/*
1132 - repeat - generate code for a bounded repetition, recursively if needed
1133 == static void repeat(register struct parse *p, sopno start, int from, int to);
1134 */
1135static void repeat(p, start, from, to) register struct parse *p;
1136sopno start; /* operand from here to end of strip */
1137int from; /* repeated from this number */
1138int to; /* to this number of times (maybe INFINITY) */
1139{
1140 register sopno finish = HERE();
1141#define N 2
1142#define INF 3
1143#define REP(f, t) ((f)*8 + (t))
1144#define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1145 register sopno copy;
1146
1147 if (p->error != 0) /* head off possible runaway recursion */
1148 return;
1149
1150 assert(from <= to);
1151
1152 switch (REP(MAP(from), MAP(to))) {
1153 case REP(0, 0): /* must be user doing this */
1154 DROP(finish - start); /* drop the operand */
1155 break;
1156 case REP(0, 1): /* as x{1,1}? */
1157 case REP(0, N): /* as x{1,n}? */
1158 case REP(0, INF): /* as x{1,}? */
1159 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1160 INSERT(OCH_, start); /* offset is wrong... */
1161 repeat(p, start + 1, 1, to);
1162 ASTERN(OOR1, start);
1163 AHEAD(start); /* ... fix it */
1164 EMIT(OOR2, 0);
1165 AHEAD(THERE());
1166 ASTERN(O_CH, THERETHERE());
1167 break;
1168 case REP(1, 1): /* trivial case */
1169 /* done */
1170 break;
1171 case REP(1, N): /* as x?x{1,n-1} */
1172 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1173 INSERT(OCH_, start);
1174 ASTERN(OOR1, start);
1175 AHEAD(start);
1176 EMIT(OOR2, 0); /* offset very wrong... */
1177 AHEAD(THERE()); /* ...so fix it */
1178 ASTERN(O_CH, THERETHERE());
1179 copy = dupl(p, start + 1, finish + 1);
1180 assert(copy == finish + 4);
1181 repeat(p, copy, 1, to - 1);
1182 break;
1183 case REP(1, INF): /* as x+ */
1184 INSERT(OPLUS_, start);
1185 ASTERN(O_PLUS, start);
1186 break;
1187 case REP(N, N): /* as xx{m-1,n-1} */
1188 copy = dupl(p, start, finish);
1189 repeat(p, copy, from - 1, to - 1);
1190 break;
1191 case REP(N, INF): /* as xx{n-1,INF} */
1192 copy = dupl(p, start, finish);
1193 repeat(p, copy, from - 1, to);
1194 break;
1195 default: /* "can't happen" */
1196 SETERROR(REG_ASSERT); /* just in case */
1197 break;
1198 }
1199}
1200
1201/*
1202 - seterr - set an error condition
1203 == static int seterr(register struct parse *p, int e);
1204 */
1205static int /* useless but makes type checking happy */
1206 seterr(p, e) register struct parse *p;
1207int e;
1208{
1209 if (p->error == 0) /* keep earliest error condition */
1210 p->error = e;
1211 p->next = nuls; /* try to bring things to a halt */
1212 p->end = nuls;
1213 return (0); /* make the return value well-defined */
1214}
1215
1216/*
1217 - allocset - allocate a set of characters for []
1218 == static cset *allocset(register struct parse *p);
1219 */
1220static cset *allocset(p) register struct parse *p;
1221{
1222 register int no = p->g->ncsets++;
1223 register size_t nc;
1224 register size_t nbytes;
1225 register cset *cs;
1226 register size_t css = (size_t)p->g->csetsize;
1227 register int i;
1228
1229 if (no >= p->ncsalloc) { /* need another column of space */
1230 p->ncsalloc += CHAR_BIT;
1231 nc = p->ncsalloc;
1232 assert(nc % CHAR_BIT == 0);
1233 nbytes = nc / CHAR_BIT * css;
1234 if (p->g->sets == NULL)
1235 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1236 else
1237 p->g->sets = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1238 if (p->g->setbits == NULL)
1239 p->g->setbits = (uch *)malloc(nbytes);
1240 else {
1241 p->g->setbits = (uch *)realloc((char *)p->g->setbits, nbytes);
1242 /* xxx this isn't right if setbits is now NULL */
1243 for (i = 0; i < no; i++)
1244 p->g->sets[i].ptr = p->g->setbits + css * (i / CHAR_BIT);
1245 }
1246 if (p->g->sets != NULL && p->g->setbits != NULL)
1247 (void)memset((char *)p->g->setbits + (nbytes - css), 0, css);
1248 else {
1249 no = 0;
1250 SETERROR(REG_ESPACE);
1251 /* caller's responsibility not to do set ops */
1252 }
1253 }
1254
1255 assert(p->g->sets != NULL); /* xxx */
1256 cs = &p->g->sets[no];
1257 cs->ptr = p->g->setbits + css * ((no) / CHAR_BIT);
1258 cs->mask = 1 << ((no) % CHAR_BIT);
1259 cs->hash = 0;
1260 cs->smultis = 0;
1261 cs->multis = NULL;
1262
1263 return (cs);
1264}
1265
1266/*
1267 - freeset - free a now-unused set
1268 == static void freeset(register struct parse *p, register cset *cs);
1269 */
1270static void freeset(p, cs) register struct parse *p;
1271register cset *cs;
1272{
1273 register int i;
1274 register cset *top = &p->g->sets[p->g->ncsets];
1275 register size_t css = (size_t)p->g->csetsize;
1276
1277 for (i = 0; i < css; i++)
1278 CHsub(cs, i);
1279 if (cs == top - 1) /* recover only the easy case */
1280 p->g->ncsets--;
1281}
1282
1283/*
1284 - freezeset - final processing on a set of characters
1285 == static int freezeset(register struct parse *p, register cset *cs);
1286 *
1287 * The main task here is merging identical sets. This is usually a waste
1288 * of time (although the hash code minimizes the overhead), but can win
1289 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1290 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1291 * the same value!
1292 */
1293static int /* set number */
1294 freezeset(p, cs) register struct parse *p;
1295register cset *cs;
1296{
1297 register short h = cs->hash;
1298 register int i;
1299 register cset *top = &p->g->sets[p->g->ncsets];
1300 register cset *cs2;
1301 register size_t css = (size_t)p->g->csetsize;
1302
1303 /* look for an earlier one which is the same */
1304 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1305 if (cs2->hash == h && cs2 != cs) {
1306 /* maybe */
1307 for (i = 0; i < css; i++)
1308 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1309 break; /* no */
1310 if (i == css)
1311 break; /* yes */
1312 }
1313
1314 if (cs2 < top) { /* found one */
1315 freeset(p, cs);
1316 cs = cs2;
1317 }
1318
1319 return ((int)(cs - p->g->sets));
1320}
1321
1322/*
1323 - firstch - return first character in a set (which must have at least one)
1324 == static int firstch(register struct parse *p, register cset *cs);
1325 */
1326static int /* character; there is no "none" value */
1327 firstch(p, cs) register struct parse *p;
1328register cset *cs;
1329{
1330 register int i;
1331 register size_t css = (size_t)p->g->csetsize;
1332
1333 for (i = 0; i < css; i++)
1334 if (CHIN(cs, i))
1335 return ((char)i);
1336 assert(never);
1337 return (0); /* arbitrary */
1338}
1339
1340/*
1341 - nch - number of characters in a set
1342 == static int nch(register struct parse *p, register cset *cs);
1343 */
1344static int nch(p, cs) register struct parse *p;
1345register cset *cs;
1346{
1347 register int i;
1348 register size_t css = (size_t)p->g->csetsize;
1349 register int n = 0;
1350
1351 for (i = 0; i < css; i++)
1352 if (CHIN(cs, i))
1353 n++;
1354 return (n);
1355}
1356
1357#if 0
1358/*
1359 - mcadd - add a collating element to a cset
1360 == static void mcadd(register struct parse *p, register cset *cs, \
1361 == register char *cp);
1362 */
1363static void
1364mcadd(p, cs, cp)
1365register struct parse *p;
1366register cset *cs;
1367register char *cp;
1368{
1369 register size_t oldend = cs->smultis;
1370
1371 cs->smultis += strlen(cp) + 1;
1372 if (cs->multis == NULL)
1373 cs->multis = malloc(cs->smultis);
1374 else
1375 cs->multis = realloc(cs->multis, cs->smultis);
1376 if (cs->multis == NULL) {
1377 SETERROR(REG_ESPACE);
1378 return;
1379 }
1380
1381 (void) strcpy(cs->multis + oldend - 1, cp);
1382 cs->multis[cs->smultis - 1] = '\0';
1383}
1384#endif
1385
1386#if used
1387/*
1388 - mcsub - subtract a collating element from a cset
1389 == static void mcsub(register cset *cs, register char *cp);
1390 */
1391static void mcsub(cs, cp) register cset *cs;
1392register char *cp;
1393{
1394 register char *fp = mcfind(cs, cp);
1395 register size_t len = strlen(fp);
1396
1397 assert(fp != NULL);
1398 (void)memmove(fp, fp + len + 1, cs->smultis - (fp + len + 1 - cs->multis));
1399 cs->smultis -= len;
1400
1401 if (cs->smultis == 0) {
1402 free(cs->multis);
1403 cs->multis = NULL;
1404 return;
1405 }
1406
1407 cs->multis = realloc(cs->multis, cs->smultis);
1408 assert(cs->multis != NULL);
1409}
1410
1411/*
1412 - mcin - is a collating element in a cset?
1413 == static int mcin(register cset *cs, register char *cp);
1414 */
1415static int mcin(cs, cp) register cset *cs;
1416register char *cp;
1417{ return (mcfind(cs, cp) != NULL); }
1418
1419/*
1420 - mcfind - find a collating element in a cset
1421 == static char *mcfind(register cset *cs, register char *cp);
1422 */
1423static char *mcfind(cs, cp) register cset *cs;
1424register char *cp;
1425{
1426 register char *p;
1427
1428 if (cs->multis == NULL)
1429 return (NULL);
1430 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1431 if (strcmp(cp, p) == 0)
1432 return (p);
1433 return (NULL);
1434}
1435#endif
1436
1437/*
1438 - mcinvert - invert the list of collating elements in a cset
1439 == static void mcinvert(register struct parse *p, register cset *cs);
1440 *
1441 * This would have to know the set of possibilities. Implementation
1442 * is deferred.
1443 */
1444static void mcinvert(p, cs) register struct parse *p;
1445register cset *cs;
1446{ assert(cs->multis == NULL); /* xxx */ }
1447
1448/*
1449 - mccase - add case counterparts of the list of collating elements in a cset
1450 == static void mccase(register struct parse *p, register cset *cs);
1451 *
1452 * This would have to know the set of possibilities. Implementation
1453 * is deferred.
1454 */
1455static void mccase(p, cs) register struct parse *p;
1456register cset *cs;
1457{ assert(cs->multis == NULL); /* xxx */ }
1458
1459/*
1460 - isinsets - is this character in any sets?
1461 == static int isinsets(register struct re_guts *g, int c);
1462 */
1463static int /* predicate */
1464 isinsets(g, c) register struct re_guts *g;
1465int c;
1466{
1467 register uch *col;
1468 register int i;
1469 register int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT;
1470 register unsigned uc = (uch)c;
1471
1472 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1473 if (col[uc] != 0)
1474 return (1);
1475 return (0);
1476}
1477
1478/*
1479 - samesets - are these two characters in exactly the same sets?
1480 == static int samesets(register struct re_guts *g, int c1, int c2);
1481 */
1482static int /* predicate */
1483 samesets(g, c1, c2) register struct re_guts *g;
1484int c1;
1485int c2;
1486{
1487 register uch *col;
1488 register int i;
1489 register int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT;
1490 register unsigned uc1 = (uch)c1;
1491 register unsigned uc2 = (uch)c2;
1492
1493 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1494 if (col[uc1] != col[uc2])
1495 return (0);
1496 return (1);
1497}
1498
1499/*
1500 - categorize - sort out character categories
1501 == static void categorize(struct parse *p, register struct re_guts *g);
1502 */
1503static void categorize(p, g) struct parse *p;
1504register struct re_guts *g;
1505{
1506 register cat_t *cats = g->categories;
1507 register int c;
1508 register int c2;
1509 register cat_t cat;
1510
1511 /* avoid making error situations worse */
1512 if (p->error != 0)
1513 return;
1514
1515 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1516 if (cats[c] == 0 && isinsets(g, c)) {
1517 cat = g->ncategories++;
1518 cats[c] = cat;
1519 for (c2 = c + 1; c2 <= CHAR_MAX; c2++)
1520 if (cats[c2] == 0 && samesets(g, c, c2))
1521 cats[c2] = cat;
1522 }
1523}
1524
1525/*
1526 - dupl - emit a duplicate of a bunch of sops
1527 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1528 */
1529static sopno /* start of duplicate */
1530 dupl(p, start, finish) register struct parse *p;
1531sopno start; /* from here */
1532sopno finish; /* to this less one */
1533{
1534 register sopno ret = HERE();
1535 register sopno len = finish - start;
1536
1537 assert(finish >= start);
1538 if (len == 0)
1539 return (ret);
1540 enlarge(p, p->ssize + len); /* this many unexpected additions */
1541 assert(p->ssize >= p->slen + len);
1542 (void)memmove((char *)(p->strip + p->slen), (char *)(p->strip + start),
1543 (size_t)len * sizeof(sop));
1544 p->slen += len;
1545 return (ret);
1546}
1547
1548/*
1549 - doemit - emit a strip operator
1550 == static void doemit(register struct parse *p, sop op, size_t opnd);
1551 *
1552 * It might seem better to implement this as a macro with a function as
1553 * hard-case backup, but it's just too big and messy unless there are
1554 * some changes to the data structures. Maybe later.
1555 */
1556static void doemit(p, op, opnd) register struct parse *p;
1557sop op;
1558size_t opnd;
1559{
1560 /* avoid making error situations worse */
1561 if (p->error != 0)
1562 return;
1563
1564 /* deal with oversize operands ("can't happen", more or less) */
1565 assert(opnd < 1 << OPSHIFT);
1566
1567 /* deal with undersized strip */
1568 if (p->slen >= p->ssize)
1569 enlarge(p, (p->ssize + 1) / 2 * 3); /* +50% */
1570 assert(p->slen < p->ssize);
1571
1572 /* finally, it's all reduced to the easy case */
1573 p->strip[p->slen++] = SOP(op, opnd);
1574}
1575
1576/*
1577 - doinsert - insert a sop into the strip
1578 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno
1579 pos);
1580 */
1581static void doinsert(p, op, opnd, pos) register struct parse *p;
1582sop op;
1583size_t opnd;
1584sopno pos;
1585{
1586 register sopno sn;
1587 register sop s;
1588 register int i;
1589
1590 /* avoid making error situations worse */
1591 if (p->error != 0)
1592 return;
1593
1594 sn = HERE();
1595 EMIT(op, opnd); /* do checks, ensure space */
1596 assert(HERE() == sn + 1);
1597 s = p->strip[sn];
1598
1599 /* adjust paren pointers */
1600 assert(pos > 0);
1601 for (i = 1; i < NPAREN; i++) {
1602 if (p->pbegin[i] >= pos) {
1603 p->pbegin[i]++;
1604 }
1605 if (p->pend[i] >= pos) {
1606 p->pend[i]++;
1607 }
1608 }
1609
1610 memmove((char *)&p->strip[pos + 1], (char *)&p->strip[pos],
1611 (HERE() - pos - 1) * sizeof(sop));
1612 p->strip[pos] = s;
1613}
1614
1615/*
1616 - dofwd - complete a forward reference
1617 == static void dofwd(register struct parse *p, sopno pos, sop value);
1618 */
1619static void dofwd(p, pos, value) register struct parse *p;
1620register sopno pos;
1621sop value;
1622{
1623 /* avoid making error situations worse */
1624 if (p->error != 0)
1625 return;
1626
1627 assert(value < 1 << OPSHIFT);
1628 p->strip[pos] = OP(p->strip[pos]) | value;
1629}
1630
1631/*
1632 - enlarge - enlarge the strip
1633 == static void enlarge(register struct parse *p, sopno size);
1634 */
1635static void enlarge(p, size) register struct parse *p;
1636register sopno size;
1637{
1638 register sop *sp;
1639
1640 if (p->ssize >= size)
1641 return;
1642
1643 sp = (sop *)realloc(p->strip, size * sizeof(sop));
1644 if (sp == NULL) {
1645 SETERROR(REG_ESPACE);
1646 return;
1647 }
1648 p->strip = sp;
1649 p->ssize = size;
1650}
1651
1652/*
1653 - stripsnug - compact the strip
1654 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1655 */
1656static void stripsnug(p, g) register struct parse *p;
1657register struct re_guts *g;
1658{
1659 g->nstates = p->slen;
1660 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1661 if (g->strip == NULL) {
1662 SETERROR(REG_ESPACE);
1663 g->strip = p->strip;
1664 }
1665}
1666
1667/*
1668 - findmust - fill in must and mlen with longest mandatory literal string
1669 == static void findmust(register struct parse *p, register struct re_guts *g);
1670 *
1671 * This algorithm could do fancy things like analyzing the operands of |
1672 * for common subsequences. Someday. This code is simple and finds most
1673 * of the interesting cases.
1674 *
1675 * Note that must and mlen got initialized during setup.
1676 */
1677static void findmust(p, g) struct parse *p;
1678register struct re_guts *g;
1679{
1680 register sop *scan;
1681 sop *start = NULL;
1682 register sop *newstart = NULL;
1683 register sopno newlen;
1684 register sop s;
1685 register char *cp;
1686 register sopno i;
1687
1688 /* avoid making error situations worse */
1689 if (p->error != 0)
1690 return;
1691
1692 /* find the longest OCHAR sequence in strip */
1693 newlen = 0;
1694 scan = g->strip + 1;
1695 do {
1696 s = *scan++;
1697 switch (OP(s)) {
1698 case OCHAR: /* sequence member */
1699 if (newlen == 0) /* new sequence */
1700 newstart = scan - 1;
1701 newlen++;
1702 break;
1703 case OPLUS_: /* things that don't break one */
1704 case OLPAREN:
1705 case ORPAREN:
1706 break;
1707 case OQUEST_: /* things that must be skipped */
1708 case OCH_:
1709 scan--;
1710 do {
1711 scan += OPND(s);
1712 s = *scan;
1713 /* assert() interferes w debug printouts */
1714 if (OP(s) != O_QUEST && OP(s) != O_CH && OP(s) != OOR2) {
1715 g->iflags |= BAD;
1716 return;
1717 }
1718 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1719 /* fallthrough */
1720 default: /* things that break a sequence */
1721 if (newlen > g->mlen) { /* ends one */
1722 start = newstart;
1723 g->mlen = newlen;
1724 }
1725 newlen = 0;
1726 break;
1727 }
1728 } while (OP(s) != OEND);
1729
1730 if (g->mlen == 0) /* there isn't one */
1731 return;
1732
1733 /* turn it into a character string */
1734 g->must = malloc((size_t)g->mlen + 1);
1735 if (g->must == NULL) { /* argh; just forget it */
1736 g->mlen = 0;
1737 return;
1738 }
1739 cp = g->must;
1740 scan = start;
1741 for (i = g->mlen; i > 0; i--) {
1742 while (OP(s = *scan++) != OCHAR)
1743 continue;
1744 assert(cp < g->must + g->mlen);
1745 *cp++ = (char)OPND(s);
1746 }
1747 assert(cp == g->must + g->mlen);
1748 *cp++ = '\0'; /* just on general principles */
1749}
1750
1751/*
1752 - pluscount - count + nesting
1753 == static sopno pluscount(register struct parse *p, register struct re_guts
1754 *g);
1755 */
1756static sopno /* nesting depth */
1757 pluscount(p, g) struct parse *p;
1758register struct re_guts *g;
1759{
1760 register sop *scan;
1761 register sop s;
1762 register sopno plusnest = 0;
1763 register sopno maxnest = 0;
1764
1765 if (p->error != 0)
1766 return (0); /* there may not be an OEND */
1767
1768 scan = g->strip + 1;
1769 do {
1770 s = *scan++;
1771 switch (OP(s)) {
1772 case OPLUS_:
1773 plusnest++;
1774 break;
1775 case O_PLUS:
1776 if (plusnest > maxnest)
1777 maxnest = plusnest;
1778 plusnest--;
1779 break;
1780 }
1781 } while (OP(s) != OEND);
1782 if (plusnest != 0)
1783 g->iflags |= BAD;
1784 return (maxnest);
1785}
Regexp character classes.
Vharacter names.
Definition: regex2.h:122
Definition: regcomp.c:145