regcomp.c source code [llvm_projects/llvm/lib/Support/regcomp.c]

1	/-*
2	* This code is derived from OpenBSD's libc/regex, original license follows:
3	*
4	* Copyright (c) 1992, 1993, 1994 Henry Spencer.
5	* Copyright (c) 1992, 1993, 1994
6	* The Regents of the University of California. All rights reserved.
7	*
8	* This code is derived from software contributed to Berkeley by
9	* Henry Spencer.
10	*
11	* Redistribution and use in source and binary forms, with or without
12	* modification, are permitted provided that the following conditions
13	* are met:
14	* 1. Redistributions of source code must retain the above copyright
15	* notice, this list of conditions and the following disclaimer.
16	* 2. Redistributions in binary form must reproduce the above copyright
17	* notice, this list of conditions and the following disclaimer in the
18	* documentation and/or other materials provided with the distribution.
19	* 3. Neither the name of the University nor the names of its contributors
20	* may be used to endorse or promote products derived from this software
21	* without specific prior written permission.
22	*
23	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33	* SUCH DAMAGE.
34	*
35	* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
36	*/
37
38	#include "regex_impl.h"
39	#include <ctype.h>
40	#include <limits.h>
41	#include <stdint.h>
42	#include <stdio.h>
43	#include <stdlib.h>
44	#include <string.h>
45	#include <sys/types.h>
46
47	#include "regex2.h"
48	#include "regutils.h"
49
50	#include "llvm/Config/config.h"
51	#include "llvm/Support/Compiler.h"
52
53	/ character-class table /
54	static struct cclass {
55	const char *name;
56	const char *chars;
57	const char *multis;
58	} cclasses[] = {
59	{"alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
60	0123456789",
61	""},
62	{"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", ""},
63	{"blank", " \t", ""},
64	{"cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
65	\25\26\27\30\31\32\33\34\35\36\37\177",
66	""},
67	{"digit", "0123456789", ""},
68	{"graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
69	0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{\|}~",
70	""},
71	{"lower", "abcdefghijklmnopqrstuvwxyz", ""},
72	{"print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
73	0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{\|}~ ",
74	""},
75	{"punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{\|}~", ""},
76	{"space", "\t\n\v\f\r ", ""},
77	{"upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", ""},
78	{"xdigit", "0123456789ABCDEFabcdef", ""},
79	{NULL, `0`, ""}};
80
81	/ character-name table /
82	static struct cname {
83	const char *name;
84	char code;
85	} cnames[] = {{"NUL", `'\0'`},
86	{"SOH", `'\001'`},
87	{"STX", `'\002'`},
88	{"ETX", `'\003'`},
89	{"EOT", `'\004'`},
90	{"ENQ", `'\005'`},
91	{"ACK", `'\006'`},
92	{"BEL", `'\007'`},
93	{"alert", `'\007'`},
94	{"BS", `'\010'`},
95	{"backspace", `'\b'`},
96	{"HT", `'\011'`},
97	{"tab", `'\t'`},
98	{"LF", `'\012'`},
99	{"newline", `'\n'`},
100	{"VT", `'\013'`},
101	{"vertical-tab", `'\v'`},
102	{"FF", `'\014'`},
103	{"form-feed", `'\f'`},
104	{"CR", `'\015'`},
105	{"carriage-return", `'\r'`},
106	{"SO", `'\016'`},
107	{"SI", `'\017'`},
108	{"DLE", `'\020'`},
109	{"DC1", `'\021'`},
110	{"DC2", `'\022'`},
111	{"DC3", `'\023'`},
112	{"DC4", `'\024'`},
113	{"NAK", `'\025'`},
114	{"SYN", `'\026'`},
115	{"ETB", `'\027'`},
116	{"CAN", `'\030'`},
117	{"EM", `'\031'`},
118	{"SUB", `'\032'`},
119	{"ESC", `'\033'`},
120	{"IS4", `'\034'`},
121	{"FS", `'\034'`},
122	{"IS3", `'\035'`},
123	{"GS", `'\035'`},
124	{"IS2", `'\036'`},
125	{"RS", `'\036'`},
126	{"IS1", `'\037'`},
127	{"US", `'\037'`},
128	{"space", `' '`},
129	{"exclamation-mark", `'!'`},
130	{"quotation-mark", `'"'`},
131	{"number-sign", `'#'`},
132	{"dollar-sign", `'$'`},
133	{"percent-sign", `'%'`},
134	{"ampersand", `'&'`},
135	{"apostrophe", `'\''`},
136	{"left-parenthesis", `'('`},
137	{"right-parenthesis", `')'`},
138	{"asterisk", `'*'`},
139	{"plus-sign", `'+'`},
140	{"comma", `','`},
141	{"hyphen", `'-'`},
142	{"hyphen-minus", `'-'`},
143	{"period", `'.'`},
144	{"full-stop", `'.'`},
145	{"slash", `'/'`},
146	{"solidus", `'/'`},
147	{"zero", `'0'`},
148	{"one", `'1'`},
149	{"two", `'2'`},
150	{"three", `'3'`},
151	{"four", `'4'`},
152	{"five", `'5'`},
153	{"six", `'6'`},
154	{"seven", `'7'`},
155	{"eight", `'8'`},
156	{"nine", `'9'`},
157	{"colon", `':'`},
158	{"semicolon", `';'`},
159	{"less-than-sign", `'<'`},
160	{"equals-sign", `'='`},
161	{"greater-than-sign", `'>'`},
162	{"question-mark", `'?'`},
163	{"commercial-at", `'@'`},
164	{"left-square-bracket", `'['`},
165	{"backslash", `'\\'`},
166	{"reverse-solidus", `'\\'`},
167	{"right-square-bracket", `']'`},
168	{"circumflex", `'^'`},
169	{"circumflex-accent", `'^'`},
170	{"underscore", `'_'`},
171	{"low-line", `'_'`},
172	{"grave-accent", '`'},
173	{"left-brace", `'{'`},
174	{"left-curly-bracket", `'{'`},
175	{"vertical-line", `'\|'`},
176	{"right-brace", `'}'`},
177	{"right-curly-bracket", `'}'`},
178	{"tilde", `'~'`},
179	{"DEL", `'\177'`},
180	{NULL, `0`}};
181
182	/*
183	* parse structure, passed up and down to avoid global variables and
184	* other clumsinesses
185	*/
186	struct parse {
187	const char next; /* next character in RE /
188	const char end; /* end of string (-> NUL normally) /
189	int error; / has an error been seen? /
190	sop strip; /* malloced strip /
191	sopno ssize; / malloced strip size (allocated) /
192	sopno slen; / malloced strip length (used) /
193	int ncsalloc; / number of csets allocated /
194	struct re_guts *g;
195	#define NPAREN 10 /* we need to remember () 1-9 for back refs */
196	sopno pbegin[NPAREN]; / -> ( ([0] unused) /
197	sopno pend[NPAREN]; / -> ) ([0] unused) /
198	};
199
200	static void p_ere(struct parse , int*);
201	static void p_ere_exp(struct parse *);
202	static void p_str(struct parse *);
203	static void p_bre(struct parse , int, int*);
204	static int p_simp_re(struct parse , int*);
205	static int p_count(struct parse *);
206	static void p_bracket(struct parse *);
207	static void p_b_term(struct parse , cset );
208	static void p_b_cclass(struct parse , cset );
209	static void p_b_eclass(struct parse , cset );
210	static char p_b_symbol(struct parse *);
211	static char p_b_coll_elem(struct parse , int*);
212	static char othercase(int);
213	static void bothcases(struct parse , int*);
214	static void ordinary(struct parse , int*);
215	static void nonnewline(struct parse *);
216	static void repeat(struct parse , sopno, int, int*);
217	static int seterr(struct parse , int*);
218	static cset allocset(struct* parse *);
219	static void freeset(struct parse , cset );
220	static int freezeset(struct parse , cset );
221	static int firstch(struct parse , cset );
222	static int nch(struct parse , cset );
223	static void mcadd(struct parse , cset , const char *);
224	static void mcinvert(struct parse , cset );
225	static void mccase(struct parse , cset );
226	static int isinsets(struct re_guts , int*);
227	static int samesets(struct re_guts , int, int*);
228	static void categorize(struct parse , struct* re_guts *);
229	static sopno dupl(struct parse *, sopno, sopno);
230	static void doemit(struct parse *, sop, size_t);
231	static void doinsert(struct parse *, sop, size_t, sopno);
232	static void dofwd(struct parse *, sopno, sop);
233	static void enlarge(struct parse *, sopno);
234	static void stripsnug(struct parse , struct* re_guts *);
235	static void findmust(struct parse , struct* re_guts *);
236	static sopno pluscount(struct parse , struct* re_guts *);
237
238	static char nuls[`10`]; / place to point scanner in event of error /
239
240	/*
241	* macros for use with parse structure
242	* BEWARE: these know that the parse structure is named `p' !!!
243	*/
244	#define PEEK() (*p->next)
245	#define PEEK2() (*(p->next + 1))
246	#define MORE() (p->end - p->next > 0)
247	#define MORE2() (p->end - p->next > 1)
248	#define SEE(c) (MORE() && PEEK() == (c))
249	#define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b))
250	#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
251	#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
252	#define NEXT() (p->next++)
253	#define NEXT2() (p->next += 2)
254	#define NEXTn(n) (p->next += (n))
255	#define GETNEXT() (*p->next++)
256	#define SETERROR(e) seterr(p, (e))
257	#define REQUIRE(co, e) (void)((co) \|\| SETERROR(e))
258	#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
259	#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
260	#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() \|\| PEEK() != (c), e))
261	#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
262	#define INSERT(op, pos) doinsert(p, (sop)(op), HERE() - (pos) + 1, pos)
263	#define AHEAD(pos) dofwd(p, pos, HERE() - (pos))
264	#define ASTERN(sop, pos) EMIT(sop, HERE() - pos)
265	#define HERE() (p->slen)
266	#define THERE() (p->slen - 1)
267	#define THERETHERE() (p->slen - 2)
268	#define DROP(n) (p->slen -= (n))
269
270	#ifdef _POSIX2_RE_DUP_MAX
271	#define DUPMAX _POSIX2_RE_DUP_MAX
272	#else
273	#define DUPMAX 255
274	#endif
275	#define REGINFINITY (DUPMAX + 1)
276
277	#ifndef NDEBUG
278	static int never = `0`; / for use in asserts; shuts lint up /
279	#else
280	#define never 0 /* some <assert.h>s have bugs too */
281	#endif
282
283	/*
284	- llvm_regcomp - interface for parser and compilation
285	*/
286	int / 0 success, otherwise REG_something /
287	llvm_regcomp(llvm_regex_t preg, const* char pattern, int* cflags) {
288	struct parse pa;
289	struct re_guts *g;
290	struct parse *p = &pa;
291	int i;
292	size_t len;
293	#ifdef REDEBUG
294	#define GOODFLAGS(f) (f)
295	#else
296	#define GOODFLAGS(f) ((f) & ~REG_DUMP)
297	#endif
298
299	cflags = GOODFLAGS(cflags);
300	if ((cflags & REG_EXTENDED) && (cflags & REG_NOSPEC))
301	return (REG_INVARG);
302
303	if (cflags & REG_PEND) {
304	if (preg->re_endp < pattern)
305	return (REG_INVARG);
306	len = preg->re_endp - pattern;
307	} else {
308	len = strlen(s: (const char *)pattern);
309	}
310
311	/ do the mallocs early so failure handling is easy /
312	g = (struct re_guts )malloc(size: sizeof(struct* re_guts) +
313	(NC - `1`) * sizeof(cat_t));
314	if (g == NULL)
315	return (REG_ESPACE);
316	p->ssize = len / (size_t)`2` * (size_t)`3` + (size_t)`1`; / ugh /
317	p->strip = (sop )calloc(nmemb: p->ssize, size: sizeof*(sop));
318	p->slen = `0`;
319	if (p->strip == NULL) {
320	free(ptr: (char *)g);
321	return (REG_ESPACE);
322	}
323
324	/ set things up /
325	p->g = g;
326	p->next = pattern;
327	p->end = p->next + len;
328	p->error = `0`;
329	p->ncsalloc = `0`;
330	for (i = `0`; i < NPAREN; i++) {
331	p->pbegin[i] = `0`;
332	p->pend[i] = `0`;
333	}
334	g->csetsize = NC;
335	g->sets = NULL;
336	g->setbits = NULL;
337	g->ncsets = `0`;
338	g->cflags = cflags;
339	g->iflags = `0`;
340	g->nbol = `0`;
341	g->neol = `0`;
342	g->must = NULL;
343	g->mlen = `0`;
344	g->nsub = `0`;
345	g->ncategories = `1`; / category 0 is "everything else" /
346	g->categories = &g->catspace[-(CHAR_MIN)];
347	(void)memset(s: (char )g->catspace, c: `0`, NC sizeof(cat_t));
348	g->backrefs = `0`;
349
350	/ do it /
351	EMIT(OEND, `0`);
352	g->firststate = THERE();
353	if (cflags & REG_EXTENDED)
354	p_ere(p, OUT);
355	else if (cflags & REG_NOSPEC)
356	p_str(p);
357	else
358	p_bre(p, OUT, OUT);
359	EMIT(OEND, `0`);
360	g->laststate = THERE();
361
362	/ tidy up loose ends and fill things in /
363	categorize(p, g);
364	stripsnug(p, g);
365	findmust(p, g);
366	g->nplus = pluscount(p, g);
367	g->magic = MAGIC2;
368	preg->re_nsub = g->nsub;
369	preg->re_g = g;
370	preg->re_magic = MAGIC1;
371	#ifndef REDEBUG
372	/ not debugging, so can't rely on the assert() in llvm_regexec() /
373	if (g->iflags & REGEX_BAD)
374	SETERROR(REG_ASSERT);
375	#endif
376
377	/ win or lose, we're done /
378	if (p->error != `0`) / lose /
379	llvm_regfree(preg);
380	return (p->error);
381	}
382
383	/*
384	- p_ere - ERE parser top level, concatenation and alternation
385	*/
386	static void p_ere(struct parse *p,
387	int stop) / character this ERE should end at /
388	{
389	char c;
390	sopno prevback = `0`;
391	sopno prevfwd = `0`;
392	sopno conc;
393	int first = `1`; / is this the first alternative? /
394
395	for (;;) {
396	/ do a bunch of concatenated expressions /
397	conc = HERE();
398	while (MORE() && (c = PEEK()) != `'\|'` && c != stop)
399	p_ere_exp(p);
400	REQUIRE(HERE() != conc, REG_EMPTY); / require nonempty /
401
402	if (!EAT(`'\|'`))
403	break; / NOTE BREAK OUT /
404
405	if (first) {
406	INSERT(OCH_, conc); / offset is wrong /
407	prevfwd = conc;
408	prevback = conc;
409	first = `0`;
410	}
411	ASTERN(OOR1, prevback);
412	prevback = THERE();
413	AHEAD(prevfwd); / fix previous offset /
414	prevfwd = HERE();
415	EMIT(OOR2, `0`); / offset is very wrong /
416	}
417
418	if (!first) { / tail-end fixups /
419	AHEAD(prevfwd);
420	ASTERN(O_CH, prevback);
421	}
422
423	assert(!MORE() \|\| SEE(stop));
424	}
425
426	/*
427	- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
428	*/
429	static void p_ere_exp(struct parse *p) {
430	char c;
431	sopno pos;
432	int count;
433	int count2;
434	int backrefnum;
435	sopno subno;
436	int wascaret = `0`;
437
438	assert(MORE()); / caller should have ensured this /
439	c = GETNEXT();
440
441	pos = HERE();
442	switch (c) {
443	case `'('`:
444	REQUIRE(MORE(), REG_EPAREN);
445	p->g->nsub++;
446	subno = p->g->nsub;
447	if (subno < NPAREN)
448	p->pbegin[subno] = HERE();
449	EMIT(OLPAREN, subno);
450	if (!SEE(`')'`))
451	p_ere(p, stop: `')'`);
452	if (subno < NPAREN) {
453	p->pend[subno] = HERE();
454	assert(p->pend[subno] != `0`);
455	}
456	EMIT(ORPAREN, subno);
457	MUSTEAT(`')'`, REG_EPAREN);
458	break;
459	#ifndef POSIX_MISTAKE
460	case `')'`: / happens only if no current unmatched ( /
461	/*
462	* You may ask, why the ifndef? Because I didn't notice
463	* this until slightly too late for 1003.2, and none of the
464	* other 1003.2 regular-expression reviewers noticed it at
465	* all. So an unmatched ) is legal POSIX, at least until
466	* we can get it fixed.
467	*/
468	SETERROR(REG_EPAREN);
469	break;
470	#endif
471	case `'^'`:
472	EMIT(OBOL, `0`);
473	p->g->iflags \|= USEBOL;
474	p->g->nbol++;
475	wascaret = `1`;
476	break;
477	case `'$'`:
478	EMIT(OEOL, `0`);
479	p->g->iflags \|= USEEOL;
480	p->g->neol++;
481	break;
482	case `'\|'`:
483	SETERROR(REG_EMPTY);
484	break;
485	case `'*'`:
486	case `'+'`:
487	case `'?'`:
488	SETERROR(REG_BADRPT);
489	break;
490	case `'.'`:
491	if (p->g->cflags & REG_NEWLINE)
492	nonnewline(p);
493	else
494	EMIT(OANY, `0`);
495	break;
496	case `'['`:
497	p_bracket(p);
498	break;
499	case `'\\'`:
500	REQUIRE(MORE(), REG_EESCAPE);
501	c = GETNEXT();
502	if (c >= `'1'` && c <= `'9'`) {
503	/ \[0-9] is taken to be a back-reference to a previously specified*
504	* matching group. backrefnum will hold the number. The matching
505	* group must exist (i.e. if \4 is found there must have been at
506	* least 4 matching groups specified in the pattern previously).
507	*/
508	backrefnum = c - `'0'`;
509	if (p->pend[backrefnum] == `0`) {
510	SETERROR(REG_ESUBREG);
511	break;
512	}
513
514	/ Make sure everything checks out and emit the sequence*
515	* that marks a back-reference to the parse structure.
516	*/
517	assert(backrefnum <= p->g->nsub);
518	EMIT(OBACK_, backrefnum);
519	assert(p->pbegin[backrefnum] != `0`);
520	assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
521	assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
522	(void)dupl(p, p->pbegin[backrefnum] + `1`, p->pend[backrefnum]);
523	EMIT(O_BACK, backrefnum);
524	p->g->backrefs = `1`;
525	} else {
526	/ Other chars are simply themselves when escaped with a backslash.*
527	*/
528	ordinary(p, c);
529	}
530	break;
531	case `'{'`: / okay as ordinary except if digit follows /
532	REQUIRE(!MORE() \|\| !isdigit((uch)PEEK()), REG_BADRPT);
533	LLVM_FALLTHROUGH;
534	default:
535	ordinary(p, c);
536	break;
537	}
538
539	if (!MORE())
540	return;
541	c = PEEK();
542	/ we call { a repetition if followed by a digit /
543	if (!(c == `'*'` \|\| c == `'+'` \|\| c == `'?'` \|\|
544	(c == `'{'` && MORE2() && isdigit((uch)PEEK2()))))
545	return; / no repetition, we're done /
546	NEXT();
547
548	REQUIRE(!wascaret, REG_BADRPT);
549	switch (c) {
550	case `''`: /* implemented as +? /
551	/ this case does not require the (y\|) trick, noKLUDGE /
552	INSERT(OPLUS_, pos);
553	ASTERN(O_PLUS, pos);
554	INSERT(OQUEST_, pos);
555	ASTERN(O_QUEST, pos);
556	break;
557	case `'+'`:
558	INSERT(OPLUS_, pos);
559	ASTERN(O_PLUS, pos);
560	break;
561	case `'?'`:
562	/ KLUDGE: emit y? as (y\|) until subtle bug gets fixed /
563	INSERT(OCH_, pos); / offset slightly wrong /
564	ASTERN(OOR1, pos); / this one's right /
565	AHEAD(pos); / fix the OCH_ /
566	EMIT(OOR2, `0`); / offset very wrong... /
567	AHEAD(THERE()); / ...so fix it /
568	ASTERN(O_CH, THERETHERE());
569	break;
570	case `'{'`:
571	count = p_count(p);
572	if (EAT(`','`)) {
573	if (isdigit((uch)PEEK())) {
574	count2 = p_count(p);
575	REQUIRE(count <= count2, REG_BADBR);
576	} else / single number with comma /
577	count2 = REGINFINITY;
578	} else / just a single number /
579	count2 = count;
580	repeat(p, pos, count, count2);
581	if (!EAT(`'}'`)) { / error heuristics /
582	while (MORE() && PEEK() != `'}'`)
583	NEXT();
584	REQUIRE(MORE(), REG_EBRACE);
585	SETERROR(REG_BADBR);
586	}
587	break;
588	}
589
590	if (!MORE())
591	return;
592	c = PEEK();
593	if (!(c == `'*'` \|\| c == `'+'` \|\| c == `'?'` \|\|
594	(c == `'{'` && MORE2() && isdigit((uch)PEEK2()))))
595	return;
596	SETERROR(REG_BADRPT);
597	}
598
599	/*
600	- p_str - string (no metacharacters) "parser"
601	*/
602	static void p_str(struct parse *p) {
603	REQUIRE(MORE(), REG_EMPTY);
604	while (MORE())
605	ordinary(p, GETNEXT());
606	}
607
608	/*
609	- p_bre - BRE parser top level, anchoring and concatenation
610	* Giving end1 as OUT essentially eliminates the end1/end2 check.
611	*
612	* This implementation is a bit of a kludge, in that a trailing $ is first
613	* taken as an ordinary character and then revised to be an anchor. The
614	* only undesirable side effect is that '$' gets included as a character
615	* category in such cases. This is fairly harmless; not worth fixing.
616	* The amount of lookahead needed to avoid this kludge is excessive.
617	*/
618	static void p_bre(struct parse p, int* end1, / first terminating character /
619	int end2) / second terminating character /
620	{
621	sopno start = HERE();
622	int first = `1`; / first subexpression? /
623	int wasdollar = `0`;
624
625	if (EAT(`'^'`)) {
626	EMIT(OBOL, `0`);
627	p->g->iflags \|= USEBOL;
628	p->g->nbol++;
629	}
630	while (MORE() && !SEETWO(end1, end2)) {
631	wasdollar = p_simp_re(p, first);
632	first = `0`;
633	}
634	if (wasdollar) { / oops, that was a trailing anchor /
635	DROP(`1`);
636	EMIT(OEOL, `0`);
637	p->g->iflags \|= USEEOL;
638	p->g->neol++;
639	}
640
641	REQUIRE(HERE() != start, REG_EMPTY); / require nonempty /
642	}
643
644	/*
645	- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
646	*/
647	static int / was the simple RE an unbackslashed $? /
648	p_simp_re(struct parse *p,
649	int starordinary) / is a leading * an ordinary character? /
650	{
651	int c;
652	int count;
653	int count2;
654	sopno pos;
655	int i;
656	sopno subno;
657	#define BACKSL (1 << CHAR_BIT)
658
659	pos = HERE(); / repetition op, if any, covers from here /
660
661	assert(MORE()); / caller should have ensured this /
662	c = GETNEXT();
663	if (c == `'\\'`) {
664	REQUIRE(MORE(), REG_EESCAPE);
665	c = BACKSL \| GETNEXT();
666	}
667	switch (c) {
668	case `'.'`:
669	if (p->g->cflags & REG_NEWLINE)
670	nonnewline(p);
671	else
672	EMIT(OANY, `0`);
673	break;
674	case `'['`:
675	p_bracket(p);
676	break;
677	case BACKSL \| `'{'`:
678	SETERROR(REG_BADRPT);
679	break;
680	case BACKSL \| `'('`:
681	p->g->nsub++;
682	subno = p->g->nsub;
683	if (subno < NPAREN)
684	p->pbegin[subno] = HERE();
685	EMIT(OLPAREN, subno);
686	/ the MORE here is an error heuristic /
687	if (MORE() && !SEETWO(`'\\'`, `')'`))
688	p_bre(p, end1: `'\\'`, end2: `')'`);
689	if (subno < NPAREN) {
690	p->pend[subno] = HERE();
691	assert(p->pend[subno] != `0`);
692	}
693	EMIT(ORPAREN, subno);
694	REQUIRE(EATTWO(`'\\'`, `')'`), REG_EPAREN);
695	break;
696	case BACKSL \| `')'`: / should not get here -- must be user /
697	case BACKSL \| `'}'`:
698	SETERROR(REG_EPAREN);
699	break;
700	case BACKSL \| `'1'`:
701	case BACKSL \| `'2'`:
702	case BACKSL \| `'3'`:
703	case BACKSL \| `'4'`:
704	case BACKSL \| `'5'`:
705	case BACKSL \| `'6'`:
706	case BACKSL \| `'7'`:
707	case BACKSL \| `'8'`:
708	case BACKSL \| `'9'`:
709	i = (c & ~BACKSL) - `'0'`;
710	assert(i < NPAREN);
711	if (p->pend[i] != `0`) {
712	assert(i <= p->g->nsub);
713	EMIT(OBACK_, i);
714	assert(p->pbegin[i] != `0`);
715	assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
716	assert(OP(p->strip[p->pend[i]]) == ORPAREN);
717	(void)dupl(p, p->pbegin[i] + `1`, p->pend[i]);
718	EMIT(O_BACK, i);
719	} else {
720	SETERROR(REG_ESUBREG);
721	}
722	p->g->backrefs = `1`;
723	break;
724	case `'*'`:
725	REQUIRE(starordinary, REG_BADRPT);
726	LLVM_FALLTHROUGH;
727	default:
728	ordinary(p, (char)c);
729	break;
730	}
731
732	if (EAT(`''`)) { /* implemented as +? /
733	/ this case does not require the (y\|) trick, noKLUDGE /
734	INSERT(OPLUS_, pos);
735	ASTERN(O_PLUS, pos);
736	INSERT(OQUEST_, pos);
737	ASTERN(O_QUEST, pos);
738	} else if (EATTWO(`'\\'`, `'{'`)) {
739	count = p_count(p);
740	if (EAT(`','`)) {
741	if (MORE() && isdigit((uch)PEEK())) {
742	count2 = p_count(p);
743	REQUIRE(count <= count2, REG_BADBR);
744	} else / single number with comma /
745	count2 = REGINFINITY;
746	} else / just a single number /
747	count2 = count;
748	repeat(p, pos, count, count2);
749	if (!EATTWO(`'\\'`, `'}'`)) { / error heuristics /
750	while (MORE() && !SEETWO(`'\\'`, `'}'`))
751	NEXT();
752	REQUIRE(MORE(), REG_EBRACE);
753	SETERROR(REG_BADBR);
754	}
755	} else if (c == `'$'`) / $ (but not \$) ends it /
756	return (`1`);
757
758	return (`0`);
759	}
760
761	/*
762	- p_count - parse a repetition count
763	*/
764	static int / the value /
765	p_count(struct parse *p) {
766	int count = `0`;
767	int ndigits = `0`;
768
769	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
770	count = count * `10` + (GETNEXT() - `'0'`);
771	ndigits++;
772	}
773
774	REQUIRE(ndigits > `0` && count <= DUPMAX, REG_BADBR);
775	return (count);
776	}
777
778	/*
779	- p_bracket - parse a bracketed character list
780	*
781	* Note a significant property of this code: if the allocset() did SETERROR,
782	* no set operations are done.
783	*/
784	static void p_bracket(struct parse *p) {
785	cset *cs;
786	int invert = `0`;
787
788	/ Dept of Truly Sickening Special-Case Kludges /
789	if (p->end - p->next > `5`) {
790	if (strncmp(s1: p->next, s2: "[:<:]]", n: `6`) == `0`) {
791	EMIT(OBOW, `0`);
792	NEXTn(`6`);
793	return;
794	}
795	if (strncmp(s1: p->next, s2: "[:>:]]", n: `6`) == `0`) {
796	EMIT(OEOW, `0`);
797	NEXTn(`6`);
798	return;
799	}
800	}
801
802	if ((cs = allocset(p)) == NULL) {
803	/ allocset did set error status in p /
804	return;
805	}
806
807	if (EAT(`'^'`))
808	invert++; / make note to invert set at end /
809	if (EAT(`']'`))
810	CHadd(cs, `']'`);
811	else if (EAT(`'-'`))
812	CHadd(cs, `'-'`);
813	while (MORE() && PEEK() != `']'` && !SEETWO(`'-'`, `']'`))
814	p_b_term(p, cs);
815	if (EAT(`'-'`))
816	CHadd(cs, `'-'`);
817	MUSTEAT(`']'`, REG_EBRACK);
818
819	if (p->error != `0`) { / don't mess things up further /
820	freeset(p, cs);
821	return;
822	}
823
824	if (p->g->cflags & REG_ICASE) {
825	int i;
826	int ci;
827
828	for (i = p->g->csetsize - `1`; i >= `0`; i--)
829	if (CHIN(cs, i) && isalpha(i)) {
830	ci = othercase(i);
831	if (ci != i)
832	CHadd(cs, ci);
833	}
834	if (cs->multis != NULL)
835	mccase(p, cs);
836	}
837	if (invert) {
838	int i;
839
840	for (i = p->g->csetsize - `1`; i >= `0`; i--)
841	if (CHIN(cs, i))
842	CHsub(cs, i);
843	else
844	CHadd(cs, i);
845	if (p->g->cflags & REG_NEWLINE)
846	CHsub(cs, `'\n'`);
847	if (cs->multis != NULL)
848	mcinvert(p, cs);
849	}
850
851	assert(cs->multis == NULL); / xxx /
852
853	if (nch(p, cs) == `1`) { / optimize singleton sets /
854	ordinary(p, firstch(p, cs));
855	freeset(p, cs);
856	} else {
857	EMIT(OANYOF, freezeset(p, cs));
858	}
859	}
860
861	/*
862	- p_b_term - parse one term of a bracketed character list
863	*/
864	static void p_b_term(struct parse p, cset cs) {
865	char c;
866	char start, finish;
867	int i;
868
869	/ classify what we've got /
870	switch ((MORE()) ? PEEK() : `'\0'`) {
871	case `'['`:
872	c = (MORE2()) ? PEEK2() : `'\0'`;
873	break;
874	case `'-'`:
875	SETERROR(REG_ERANGE);
876	return; / NOTE RETURN /
877	break;
878	default:
879	c = `'\0'`;
880	break;
881	}
882
883	switch (c) {
884	case `':'`: / character class /
885	NEXT2();
886	REQUIRE(MORE(), REG_EBRACK);
887	c = PEEK();
888	REQUIRE(c != `'-'` && c != `']'`, REG_ECTYPE);
889	p_b_cclass(p, cs);
890	REQUIRE(MORE(), REG_EBRACK);
891	REQUIRE(EATTWO(`':'`, `']'`), REG_ECTYPE);
892	break;
893	case `'='`: / equivalence class /
894	NEXT2();
895	REQUIRE(MORE(), REG_EBRACK);
896	c = PEEK();
897	REQUIRE(c != `'-'` && c != `']'`, REG_ECOLLATE);
898	p_b_eclass(p, cs);
899	REQUIRE(MORE(), REG_EBRACK);
900	REQUIRE(EATTWO(`'='`, `']'`), REG_ECOLLATE);
901	break;
902	default: / symbol, ordinary character, or range /
903	/ xxx revision needed for multichar stuff /
904	start = p_b_symbol(p);
905	if (SEE(`'-'`) && MORE2() && PEEK2() != `']'`) {
906	/ range /
907	NEXT();
908	if (EAT(`'-'`))
909	finish = `'-'`;
910	else
911	finish = p_b_symbol(p);
912	} else {
913	finish = start;
914	}
915	/ xxx what about signed chars here... /
916	REQUIRE(start <= finish, REG_ERANGE);
917	for (i = start; i <= finish; i++)
918	CHadd(cs, i);
919	break;
920	}
921	}
922
923	/*
924	- p_b_cclass - parse a character-class name and deal with it
925	*/
926	static void p_b_cclass(struct parse p, cset cs) {
927	const char *sp = p->next;
928	struct cclass *cp;
929	size_t len;
930	const char *u;
931	char c;
932
933	while (MORE() && isalpha((uch)PEEK()))
934	NEXT();
935	len = p->next - sp;
936	for (cp = cclasses; cp->name != NULL; cp++)
937	if (strncmp(s1: cp->name, s2: sp, n: len) == `0` && cp->name[len] == `'\0'`)
938	break;
939	if (cp->name == NULL) {
940	/ oops, didn't find it /
941	SETERROR(REG_ECTYPE);
942	return;
943	}
944
945	u = cp->chars;
946	while ((c = *u++) != `'\0'`)
947	CHadd(cs, c);
948	for (u = cp->multis; *u != `'\0'`; u += strlen(s: u) + `1`)
949	MCadd(p, cs, u);
950	}
951
952	/*
953	- p_b_eclass - parse an equivalence-class name and deal with it
954	*
955	* This implementation is incomplete. xxx
956	*/
957	static void p_b_eclass(struct parse p, cset cs) {
958	char c;
959
960	c = p_b_coll_elem(p, `'='`);
961	CHadd(cs, c);
962	}
963
964	/*
965	- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
966	*/
967	static char / value of symbol /
968	p_b_symbol(struct parse *p) {
969	char value;
970
971	REQUIRE(MORE(), REG_EBRACK);
972	if (!EATTWO(`'['`, `'.'`))
973	return (GETNEXT());
974
975	/ collating symbol /
976	value = p_b_coll_elem(p, `'.'`);
977	REQUIRE(EATTWO(`'.'`, `']'`), REG_ECOLLATE);
978	return (value);
979	}
980
981	/*
982	- p_b_coll_elem - parse a collating-element name and look it up
983	*/
984	static char / value of collating element /
985	p_b_coll_elem(struct parse p, int* endc) / name ended by endc,']' /
986	{
987	const char *sp = p->next;
988	struct cname *cp;
989	size_t len;
990
991	while (MORE() && !SEETWO(endc, `']'`))
992	NEXT();
993	if (!MORE()) {
994	SETERROR(REG_EBRACK);
995	return (`0`);
996	}
997	len = p->next - sp;
998	for (cp = cnames; cp->name != NULL; cp++)
999	if (strncmp(s1: cp->name, s2: sp, n: len) == `0` && strlen(s: cp->name) == len)
1000	return (cp->code); / known name /
1001	if (len == `1`)
1002	return (sp); /* single character /
1003	SETERROR(REG_ECOLLATE); / neither /
1004	return (`0`);
1005	}
1006
1007	/*
1008	- othercase - return the case counterpart of an alphabetic
1009	*/
1010	static char / if no counterpart, return ch /
1011	othercase(int ch) {
1012	ch = (uch)ch;
1013	assert(isalpha(ch));
1014	if (isupper(ch))
1015	return ((uch)tolower(ch));
1016	else if (islower(ch))
1017	return ((uch)toupper(ch));
1018	else / peculiar, but could happen /
1019	return (ch);
1020	}
1021
1022	/*
1023	- bothcases - emit a dualcase version of a two-case character
1024	*
1025	* Boy, is this implementation ever a kludge...
1026	*/
1027	static void bothcases(struct parse p, int* ch) {
1028	const char *oldnext = p->next;
1029	const char *oldend = p->end;
1030	char bracket[`3`];
1031
1032	ch = (uch)ch;
1033	assert(othercase(ch) != ch); / p_bracket() would recurse /
1034	p->next = bracket;
1035	p->end = bracket + `2`;
1036	bracket[`0`] = ch;
1037	bracket[`1`] = `']'`;
1038	bracket[`2`] = `'\0'`;
1039	p_bracket(p);
1040	assert(p->next == bracket + `2`);
1041	p->next = oldnext;
1042	p->end = oldend;
1043	}
1044
1045	/*
1046	- ordinary - emit an ordinary character
1047	*/
1048	static void ordinary(struct parse p, int* ch) {
1049	cat_t *cap = p->g->categories;
1050
1051	if ((p->g->cflags & REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1052	bothcases(p, ch);
1053	else {
1054	EMIT(OCHAR, (uch)ch);
1055	if (cap[ch] == `0`)
1056	cap[ch] = p->g->ncategories++;
1057	}
1058	}
1059
1060	/*
1061	- nonnewline - emit REG_NEWLINE version of OANY
1062	*
1063	* Boy, is this implementation ever a kludge...
1064	*/
1065	static void nonnewline(struct parse *p) {
1066	const char *oldnext = p->next;
1067	const char *oldend = p->end;
1068	static const char bracket[`4`] = {`'^'`, `'\n'`, `']'`, `'\0'`};
1069
1070	p->next = bracket;
1071	p->end = bracket + `3`;
1072	p_bracket(p);
1073	assert(p->next == bracket + `3`);
1074	p->next = oldnext;
1075	p->end = oldend;
1076	}
1077
1078	/*
1079	- repeat - generate code for a bounded repetition, recursively if needed
1080	*/
1081	static void repeat(struct parse *p,
1082	sopno start, / operand from here to end of strip /
1083	int from, / repeated from this number /
1084	int to) / to this number of times (maybe INFINITY) /
1085	{
1086	sopno finish = HERE();
1087	#define N 2
1088	#define INF 3
1089	#define REP(f, t) ((f) * 8 + (t))
1090	#define MAP(n) (((n) <= 1) ? (n) : ((n) == REGINFINITY) ? INF : N)
1091	sopno copy;
1092
1093	if (p->error != `0`) / head off possible runaway recursion /
1094	return;
1095
1096	assert(from <= to);
1097
1098	switch (REP(MAP(from), MAP(to))) {
1099	case REP(`0`, `0`): / must be user doing this /
1100	DROP(finish - start); / drop the operand /
1101	break;
1102	case REP(`0`, `1`): / as x{1,1}? /
1103	case REP(`0`, N): / as x{1,n}? /
1104	case REP(`0`, INF): / as x{1,}? /
1105	/ KLUDGE: emit y? as (y\|) until subtle bug gets fixed /
1106	INSERT(OCH_, start); / offset is wrong... /
1107	repeat(p, start: start + `1`, from: `1`, to);
1108	ASTERN(OOR1, start);
1109	AHEAD(start); / ... fix it /
1110	EMIT(OOR2, `0`);
1111	AHEAD(THERE());
1112	ASTERN(O_CH, THERETHERE());
1113	break;
1114	case REP(`1`, `1`): / trivial case /
1115	/ done /
1116	break;
1117	case REP(`1`, N): / as x?x{1,n-1} /
1118	/ KLUDGE: emit y? as (y\|) until subtle bug gets fixed /
1119	INSERT(OCH_, start);
1120	ASTERN(OOR1, start);
1121	AHEAD(start);
1122	EMIT(OOR2, `0`); / offset very wrong... /
1123	AHEAD(THERE()); / ...so fix it /
1124	ASTERN(O_CH, THERETHERE());
1125	copy = dupl(p, start + `1`, finish + `1`);
1126	assert(copy == finish + `4`);
1127	repeat(p, start: copy, from: `1`, to: to - `1`);
1128	break;
1129	case REP(`1`, INF): / as x+ /
1130	INSERT(OPLUS_, start);
1131	ASTERN(O_PLUS, start);
1132	break;
1133	case REP(N, N): / as xx{m-1,n-1} /
1134	copy = dupl(p, start, finish);
1135	repeat(p, start: copy, from: from - `1`, to: to - `1`);
1136	break;
1137	case REP(N, INF): / as xx{n-1,INF} /
1138	copy = dupl(p, start, finish);
1139	repeat(p, start: copy, from: from - `1`, to);
1140	break;
1141	default: / "can't happen" /
1142	SETERROR(REG_ASSERT); / just in case /
1143	break;
1144	}
1145	}
1146
1147	/*
1148	- seterr - set an error condition
1149	*/
1150	static int / useless but makes type checking happy /
1151	seterr(struct parse p, int* e) {
1152	if (p->error == `0`) / keep earliest error condition /
1153	p->error = e;
1154	p->next = nuls; / try to bring things to a halt /
1155	p->end = nuls;
1156	return (`0`); / make the return value well-defined /
1157	}
1158
1159	/*
1160	- allocset - allocate a set of characters for []
1161	*/
1162	static cset allocset(struct* parse *p) {
1163	int no = p->g->ncsets++;
1164	size_t nc;
1165	size_t nbytes;
1166	cset *cs;
1167	size_t css = (size_t)p->g->csetsize;
1168	int i;
1169
1170	if (no >= p->ncsalloc) { / need another column of space /
1171	void *ptr;
1172
1173	p->ncsalloc += CHAR_BIT;
1174	nc = p->ncsalloc;
1175	if (nc > SIZE_MAX / sizeof(cset))
1176	goto nomem;
1177	assert(nc % CHAR_BIT == `0`);
1178	nbytes = nc / CHAR_BIT * css;
1179
1180	ptr = (cset )realloc(ptr: (char* )p->g->sets, size: nc sizeof(cset));
1181	if (ptr == NULL)
1182	goto nomem;
1183	p->g->sets = ptr;
1184
1185	ptr = (uch )realloc(ptr: (char* *)p->g->setbits, size: nbytes);
1186	if (ptr == NULL)
1187	goto nomem;
1188	p->g->setbits = ptr;
1189
1190	for (i = `0`; i < no; i++)
1191	p->g->sets[i].ptr = p->g->setbits + css * (i / CHAR_BIT);
1192
1193	(void)memset(s: (char *)p->g->setbits + (nbytes - css), c: `0`, n: css);
1194	}
1195	/ XXX should not happen /
1196	if (p->g->sets == NULL \|\| p->g->setbits == NULL)
1197	goto nomem;
1198
1199	cs = &p->g->sets[no];
1200	cs->ptr = p->g->setbits + css * ((no) / CHAR_BIT);
1201	cs->mask = `1` << ((no) % CHAR_BIT);
1202	cs->hash = `0`;
1203	cs->smultis = `0`;
1204	cs->multis = NULL;
1205
1206	return (cs);
1207	nomem:
1208	free(ptr: p->g->sets);
1209	p->g->sets = NULL;
1210	free(ptr: p->g->setbits);
1211	p->g->setbits = NULL;
1212
1213	SETERROR(REG_ESPACE);
1214	/ caller's responsibility not to do set ops /
1215	return (NULL);
1216	}
1217
1218	/*
1219	- freeset - free a now-unused set
1220	*/
1221	static void freeset(struct parse p, cset cs) {
1222	size_t i;
1223	cset *top = &p->g->sets[p->g->ncsets];
1224	size_t css = (size_t)p->g->csetsize;
1225
1226	for (i = `0`; i < css; i++)
1227	CHsub(cs, i);
1228	if (cs == top - `1`) / recover only the easy case /
1229	p->g->ncsets--;
1230	}
1231
1232	/*
1233	- freezeset - final processing on a set of characters
1234	*
1235	* The main task here is merging identical sets. This is usually a waste
1236	* of time (although the hash code minimizes the overhead), but can win
1237	* big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1238	* is done using addition rather than xor -- all ASCII [aA] sets xor to
1239	* the same value!
1240	*/
1241	static int / set number /
1242	freezeset(struct parse p, cset cs) {
1243	uch h = cs->hash;
1244	size_t i;
1245	cset *top = &p->g->sets[p->g->ncsets];
1246	cset *cs2;
1247	size_t css = (size_t)p->g->csetsize;
1248
1249	/ look for an earlier one which is the same /
1250	for (cs2 = &p->g->sets[`0`]; cs2 < top; cs2++)
1251	if (cs2->hash == h && cs2 != cs) {
1252	/ maybe /
1253	for (i = `0`; i < css; i++)
1254	if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1255	break; / no /
1256	if (i == css)
1257	break; / yes /
1258	}
1259
1260	if (cs2 < top) { / found one /
1261	freeset(p, cs);
1262	cs = cs2;
1263	}
1264
1265	return ((int)(cs - p->g->sets));
1266	}
1267
1268	/*
1269	- firstch - return first character in a set (which must have at least one)
1270	*/
1271	static int / character; there is no "none" value /
1272	firstch(struct parse p, cset cs) {
1273	size_t i;
1274	size_t css = (size_t)p->g->csetsize;
1275
1276	for (i = `0`; i < css; i++)
1277	if (CHIN(cs, i))
1278	return ((char)i);
1279	assert(never);
1280	return (`0`); / arbitrary /
1281	}
1282
1283	/*
1284	- nch - number of characters in a set
1285	*/
1286	static int nch(struct parse p, cset cs) {
1287	size_t i;
1288	size_t css = (size_t)p->g->csetsize;
1289	int n = `0`;
1290
1291	for (i = `0`; i < css; i++)
1292	if (CHIN(cs, i))
1293	n++;
1294	return (n);
1295	}
1296
1297	/*
1298	- mcadd - add a collating element to a cset
1299	*/
1300	static void mcadd(struct parse p, cset cs, const char *cp) {
1301	size_t oldend = cs->smultis;
1302	void *np;
1303
1304	cs->smultis += strlen(s: cp) + `1`;
1305	np = realloc(ptr: cs->multis, size: cs->smultis);
1306	if (np == NULL) {
1307	if (cs->multis)
1308	free(ptr: cs->multis);
1309	cs->multis = NULL;
1310	SETERROR(REG_ESPACE);
1311	return;
1312	}
1313	cs->multis = np;
1314
1315	llvm_strlcpy(dst: cs->multis + oldend - `1`, src: cp, siz: cs->smultis - oldend + `1`);
1316	}
1317
1318	/*
1319	- mcinvert - invert the list of collating elements in a cset
1320	*
1321	* This would have to know the set of possibilities. Implementation
1322	* is deferred.
1323	*/
1324	/ ARGSUSED /
1325	static void mcinvert(struct parse p, cset cs) {
1326	assert(cs->multis == NULL); / xxx /
1327	}
1328
1329	/*
1330	- mccase - add case counterparts of the list of collating elements in a cset
1331	*
1332	* This would have to know the set of possibilities. Implementation
1333	* is deferred.
1334	*/
1335	/ ARGSUSED /
1336	static void mccase(struct parse p, cset cs) {
1337	assert(cs->multis == NULL); / xxx /
1338	}
1339
1340	/*
1341	- isinsets - is this character in any sets?
1342	*/
1343	static int / predicate /
1344	isinsets(struct re_guts g, int* c) {
1345	uch *col;
1346	int i;
1347	int ncols = (g->ncsets + (CHAR_BIT - `1`)) / CHAR_BIT;
1348	unsigned uc = (uch)c;
1349
1350	for (i = `0`, col = g->setbits; i < ncols; i++, col += g->csetsize)
1351	if (col[uc] != `0`)
1352	return (`1`);
1353	return (`0`);
1354	}
1355
1356	/*
1357	- samesets - are these two characters in exactly the same sets?
1358	*/
1359	static int / predicate /
1360	samesets(struct re_guts g, int* c1, int c2) {
1361	uch *col;
1362	int i;
1363	int ncols = (g->ncsets + (CHAR_BIT - `1`)) / CHAR_BIT;
1364	unsigned uc1 = (uch)c1;
1365	unsigned uc2 = (uch)c2;
1366
1367	for (i = `0`, col = g->setbits; i < ncols; i++, col += g->csetsize)
1368	if (col[uc1] != col[uc2])
1369	return (`0`);
1370	return (`1`);
1371	}
1372
1373	/*
1374	- categorize - sort out character categories
1375	*/
1376	static void categorize(struct parse p, struct* re_guts *g) {
1377	cat_t *cats = g->categories;
1378	int c;
1379	int c2;
1380	cat_t cat;
1381
1382	/ avoid making error situations worse /
1383	if (p->error != `0`)
1384	return;
1385
1386	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1387	if (cats[c] == `0` && isinsets(g, c)) {
1388	cat = g->ncategories++;
1389	cats[c] = cat;
1390	for (c2 = c + `1`; c2 <= CHAR_MAX; c2++)
1391	if (cats[c2] == `0` && samesets(g, c1: c, c2))
1392	cats[c2] = cat;
1393	}
1394	}
1395
1396	/*
1397	- dupl - emit a duplicate of a bunch of sops
1398	*/
1399	static sopno / start of duplicate /
1400	dupl(struct parse p, sopno start, /* from here /
1401	sopno finish) / to this less one /
1402	{
1403	sopno ret = HERE();
1404	sopno len = finish - start;
1405
1406	assert(finish >= start);
1407	if (len == `0`)
1408	return (ret);
1409	enlarge(p, p->ssize + len); / this many unexpected additions /
1410	assert(p->ssize >= p->slen + len);
1411	(void)memmove(dest: (char )(p->strip + p->slen), src: (char* *)(p->strip + start),
1412	n: (size_t)len * sizeof(sop));
1413	p->slen += len;
1414	return (ret);
1415	}
1416
1417	/*
1418	- doemit - emit a strip operator
1419	*
1420	* It might seem better to implement this as a macro with a function as
1421	* hard-case backup, but it's just too big and messy unless there are
1422	* some changes to the data structures. Maybe later.
1423	*/
1424	static void doemit(struct parse *p, sop op, size_t opnd) {
1425	/ avoid making error situations worse /
1426	if (p->error != `0`)
1427	return;
1428
1429	/ deal with oversize operands ("can't happen", more or less) /
1430	assert(opnd < `1` << OPSHIFT);
1431
1432	/ deal with undersized strip /
1433	if (p->slen >= p->ssize)
1434	enlarge(p, (p->ssize + `1`) / `2` * `3`); / +50% /
1435	assert(p->slen < p->ssize);
1436
1437	/ finally, it's all reduced to the easy case /
1438	p->strip[p->slen++] = SOP(op, opnd);
1439	}
1440
1441	/*
1442	- doinsert - insert a sop into the strip
1443	*/
1444	static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos) {
1445	sopno sn;
1446	sop s;
1447	int i;
1448
1449	/ avoid making error situations worse /
1450	if (p->error != `0`)
1451	return;
1452
1453	sn = HERE();
1454	EMIT(op, opnd); / do checks, ensure space /
1455	assert(HERE() == sn + `1`);
1456	s = p->strip[sn];
1457
1458	/ adjust paren pointers /
1459	assert(pos > `0`);
1460	for (i = `1`; i < NPAREN; i++) {
1461	if (p->pbegin[i] >= pos) {
1462	p->pbegin[i]++;
1463	}
1464	if (p->pend[i] >= pos) {
1465	p->pend[i]++;
1466	}
1467	}
1468
1469	memmove(dest: (char )&p->strip[pos + `1`], src: (char* *)&p->strip[pos],
1470	n: (HERE() - pos - `1`) * sizeof(sop));
1471	p->strip[pos] = s;
1472	}
1473
1474	/*
1475	- dofwd - complete a forward reference
1476	*/
1477	static void dofwd(struct parse *p, sopno pos, sop value) {
1478	/ avoid making error situations worse /
1479	if (p->error != `0`)
1480	return;
1481
1482	assert(value < `1` << OPSHIFT);
1483	p->strip[pos] = OP(p->strip[pos]) \| value;
1484	}
1485
1486	/*
1487	- enlarge - enlarge the strip
1488	*/
1489	static void enlarge(struct parse *p, sopno size) {
1490	sop *sp;
1491
1492	if (p->ssize >= size)
1493	return;
1494
1495	if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
1496	SETERROR(REG_ESPACE);
1497	return;
1498	}
1499
1500	sp = (sop )realloc(ptr: p->strip, size: size sizeof(sop));
1501	if (sp == NULL) {
1502	SETERROR(REG_ESPACE);
1503	return;
1504	}
1505	p->strip = sp;
1506	p->ssize = size;
1507	}
1508
1509	/*
1510	- stripsnug - compact the strip
1511	*/
1512	static void stripsnug(struct parse p, struct* re_guts *g) {
1513	g->nstates = p->slen;
1514	if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
1515	g->strip = p->strip;
1516	SETERROR(REG_ESPACE);
1517	return;
1518	}
1519
1520	g->strip = (sop )realloc(ptr: (char* )p->strip, size: p->slen sizeof(sop));
1521	if (g->strip == NULL) {
1522	SETERROR(REG_ESPACE);
1523	g->strip = p->strip;
1524	}
1525	}
1526
1527	/*
1528	- findmust - fill in must and mlen with longest mandatory literal string
1529	*
1530	* This algorithm could do fancy things like analyzing the operands of \|
1531	* for common subsequences. Someday. This code is simple and finds most
1532	* of the interesting cases.
1533	*
1534	* Note that must and mlen got initialized during setup.
1535	*/
1536	static void findmust(struct parse p, struct* re_guts *g) {
1537	sop *scan;
1538	sop start = `0`; /* start initialized in the default case, after that /
1539	sop newstart = `0`; /* newstart was initialized in the OCHAR case /
1540	sopno newlen;
1541	sop s;
1542	char *cp;
1543	sopno i;
1544
1545	/ avoid making error situations worse /
1546	if (p->error != `0`)
1547	return;
1548
1549	/ find the longest OCHAR sequence in strip /
1550	newlen = `0`;
1551	scan = g->strip + `1`;
1552	do {
1553	s = *scan++;
1554	switch (OP(s)) {
1555	case OCHAR: / sequence member /
1556	if (newlen == `0`) / new sequence /
1557	newstart = scan - `1`;
1558	newlen++;
1559	break;
1560	case OPLUS_: / things that don't break one /
1561	case OLPAREN:
1562	case ORPAREN:
1563	break;
1564	case OQUEST_: / things that must be skipped /
1565	case OCH_:
1566	scan--;
1567	do {
1568	scan += OPND(s);
1569	s = *scan;
1570	/ assert() interferes w debug printouts /
1571	if (OP(s) != O_QUEST && OP(s) != O_CH && OP(s) != OOR2) {
1572	g->iflags \|= REGEX_BAD;
1573	return;
1574	}
1575	} while (OP(s) != O_QUEST && OP(s) != O_CH);
1576	LLVM_FALLTHROUGH;
1577	default: / things that break a sequence /
1578	if (newlen > g->mlen) { / ends one /
1579	start = newstart;
1580	g->mlen = newlen;
1581	}
1582	newlen = `0`;
1583	break;
1584	}
1585	} while (OP(s) != OEND);
1586
1587	if (g->mlen == `0`) / there isn't one /
1588	return;
1589
1590	/ turn it into a character string /
1591	g->must = malloc(size: (size_t)g->mlen + `1`);
1592	if (g->must == NULL) { / argh; just forget it /
1593	g->mlen = `0`;
1594	return;
1595	}
1596	cp = g->must;
1597	scan = start;
1598	for (i = g->mlen; i > `0`; i--) {
1599	while (OP(s = *scan++) != OCHAR)
1600	continue;
1601	assert(cp < g->must + g->mlen);
1602	cp++ = (char*)OPND(s);
1603	}
1604	assert(cp == g->must + g->mlen);
1605	cp++ = `'\0'`; /* just on general principles /
1606	}
1607
1608	/*
1609	- pluscount - count + nesting
1610	*/
1611	static sopno / nesting depth /
1612	pluscount(struct parse p, struct* re_guts *g) {
1613	sop *scan;
1614	sop s;
1615	sopno plusnest = `0`;
1616	sopno maxnest = `0`;
1617
1618	if (p->error != `0`)
1619	return (`0`); / there may not be an OEND /
1620
1621	scan = g->strip + `1`;
1622	do {
1623	s = *scan++;
1624	switch (OP(s)) {
1625	case OPLUS_:
1626	plusnest++;
1627	break;
1628	case O_PLUS:
1629	if (plusnest > maxnest)
1630	maxnest = plusnest;
1631	plusnest--;
1632	break;
1633	}
1634	} while (OP(s) != OEND);
1635	if (plusnest != `0`)
1636	g->iflags \|= REGEX_BAD;
1637	return (maxnest);
1638	}
1639

Browse the source code of llvm_projects/llvm/lib/Support/regcomp.c