regcomp.c source code [llvm_projects/llvm/lib/Support/regcomp.c]

/*-
 * This code is derived from OpenBSD's libc/regex, original license follows:
 *
 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
 * Copyright (c) 1992, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Henry Spencer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
 */
37
38	#include <sys/types.h>
39	#include <stdint.h>
40	#include <stdio.h>
41	#include <string.h>
42	#include <ctype.h>
43	#include <limits.h>
44	#include <stdlib.h>
45	#include "regex_impl.h"
46
47	#include "regutils.h"
48	#include "regex2.h"
49
50	#include "llvm/Config/config.h"
51	#include "llvm/Support/Compiler.h"
52
/*
 * character-class table
 *
 * Maps each POSIX bracket-expression class name (the "alpha" in [[:alpha:]])
 * to the explicit list of member characters.  `multis` is a list of
 * NUL-separated multi-character members terminated by an empty string; every
 * class here has none.  The table is terminated by an entry whose name is
 * NULL.
 */
static struct cclass {
	const char *name;	/* class name as written between [: and :] */
	const char *chars;	/* single-character members */
	const char *multis;	/* multi-character members (always "" here) */
} cclasses[] = {
	{ "alnum",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
			"0123456789",			"" },
	{ "alpha",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
							"" },
	{ "blank",	" \t",				"" },
	{ "cntrl",	"\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24"
			"\25\26\27\30\31\32\33\34\35\36\37\177",
							"" },
	{ "digit",	"0123456789",			"" },
	{ "graph",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
			"0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
							"" },
	{ "lower",	"abcdefghijklmnopqrstuvwxyz",
							"" },
	{ "print",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
			"0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
							"" },
	{ "punct",	"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
							"" },
	{ "space",	"\t\n\v\f\r ",			"" },
	{ "upper",	"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
							"" },
	{ "xdigit",	"0123456789ABCDEFabcdef",
							"" },
	{ NULL,		0,				"" }
};
84
/*
 * character-name table
 *
 * Maps collating-element names (the "tab" in [[.tab.]] or [[=tab=]]) to the
 * single character they denote.  Looked up by p_b_coll_elem().  The table is
 * terminated by an entry whose name is NULL.
 */
static struct cname {
	const char *name;	/* collating-element name */
	char code;		/* character it stands for */
} cnames[] = {
	{ "NUL",			'\0' },
	{ "SOH",			'\001' },
	{ "STX",			'\002' },
	{ "ETX",			'\003' },
	{ "EOT",			'\004' },
	{ "ENQ",			'\005' },
	{ "ACK",			'\006' },
	{ "BEL",			'\007' },
	{ "alert",			'\007' },
	{ "BS",				'\010' },
	{ "backspace",			'\b' },
	{ "HT",				'\011' },
	{ "tab",			'\t' },
	{ "LF",				'\012' },
	{ "newline",			'\n' },
	{ "VT",				'\013' },
	{ "vertical-tab",		'\v' },
	{ "FF",				'\014' },
	{ "form-feed",			'\f' },
	{ "CR",				'\015' },
	{ "carriage-return",		'\r' },
	{ "SO",				'\016' },
	{ "SI",				'\017' },
	{ "DLE",			'\020' },
	{ "DC1",			'\021' },
	{ "DC2",			'\022' },
	{ "DC3",			'\023' },
	{ "DC4",			'\024' },
	{ "NAK",			'\025' },
	{ "SYN",			'\026' },
	{ "ETB",			'\027' },
	{ "CAN",			'\030' },
	{ "EM",				'\031' },
	{ "SUB",			'\032' },
	{ "ESC",			'\033' },
	{ "IS4",			'\034' },
	{ "FS",				'\034' },
	{ "IS3",			'\035' },
	{ "GS",				'\035' },
	{ "IS2",			'\036' },
	{ "RS",				'\036' },
	{ "IS1",			'\037' },
	{ "US",				'\037' },
	{ "space",			' ' },
	{ "exclamation-mark",		'!' },
	{ "quotation-mark",		'"' },
	{ "number-sign",		'#' },
	{ "dollar-sign",		'$' },
	{ "percent-sign",		'%' },
	{ "ampersand",			'&' },
	{ "apostrophe",			'\'' },
	{ "left-parenthesis",		'(' },
	{ "right-parenthesis",		')' },
	{ "asterisk",			'*' },
	{ "plus-sign",			'+' },
	{ "comma",			',' },
	{ "hyphen",			'-' },
	{ "hyphen-minus",		'-' },
	{ "period",			'.' },
	{ "full-stop",			'.' },
	{ "slash",			'/' },
	{ "solidus",			'/' },
	{ "zero",			'0' },
	{ "one",			'1' },
	{ "two",			'2' },
	{ "three",			'3' },
	{ "four",			'4' },
	{ "five",			'5' },
	{ "six",			'6' },
	{ "seven",			'7' },
	{ "eight",			'8' },
	{ "nine",			'9' },
	{ "colon",			':' },
	{ "semicolon",			';' },
	{ "less-than-sign",		'<' },
	{ "equals-sign",		'=' },
	{ "greater-than-sign",		'>' },
	{ "question-mark",		'?' },
	{ "commercial-at",		'@' },
	{ "left-square-bracket",	'[' },
	{ "backslash",			'\\' },
	{ "reverse-solidus",		'\\' },
	{ "right-square-bracket",	']' },
	{ "circumflex",			'^' },
	{ "circumflex-accent",		'^' },
	{ "underscore",			'_' },
	{ "low-line",			'_' },
	{ "grave-accent",		'`' },
	{ "left-brace",			'{' },
	{ "left-curly-bracket",		'{' },
	{ "vertical-line",		'|' },
	{ "right-brace",		'}' },
	{ "right-curly-bracket",	'}' },
	{ "tilde",			'~' },
	{ "DEL",			'\177' },
	{ NULL,				0 }
};
187
188	/*
189	* parse structure, passed up and down to avoid global variables and
190	* other clumsinesses
191	*/
192	struct parse {
193	const char next; /* next character in RE /
194	const char end; /* end of string (-> NUL normally) /
195	int error; / has an error been seen? /
196	sop strip; /* malloced strip /
197	sopno ssize; / malloced strip size (allocated) /
198	sopno slen; / malloced strip length (used) /
199	int ncsalloc; / number of csets allocated /
200	struct re_guts *g;
201	# define NPAREN 10 /* we need to remember () 1-9 for back refs */
202	sopno pbegin[NPAREN]; / -> ( ([0] unused) /
203	sopno pend[NPAREN]; / -> ) ([0] unused) /
204	};
205
206	static void p_ere(struct parse , int*);
207	static void p_ere_exp(struct parse *);
208	static void p_str(struct parse *);
209	static void p_bre(struct parse , int, int*);
210	static int p_simp_re(struct parse , int*);
211	static int p_count(struct parse *);
212	static void p_bracket(struct parse *);
213	static void p_b_term(struct parse , cset );
214	static void p_b_cclass(struct parse , cset );
215	static void p_b_eclass(struct parse , cset );
216	static char p_b_symbol(struct parse *);
217	static char p_b_coll_elem(struct parse , int*);
218	static char othercase(int);
219	static void bothcases(struct parse , int*);
220	static void ordinary(struct parse , int*);
221	static void nonnewline(struct parse *);
222	static void repeat(struct parse , sopno, int, int*);
223	static int seterr(struct parse , int*);
224	static cset allocset(struct* parse *);
225	static void freeset(struct parse , cset );
226	static int freezeset(struct parse , cset );
227	static int firstch(struct parse , cset );
228	static int nch(struct parse , cset );
229	static void mcadd(struct parse , cset , const char *);
230	static void mcinvert(struct parse , cset );
231	static void mccase(struct parse , cset );
232	static int isinsets(struct re_guts , int*);
233	static int samesets(struct re_guts , int, int*);
234	static void categorize(struct parse , struct* re_guts *);
235	static sopno dupl(struct parse *, sopno, sopno);
236	static void doemit(struct parse *, sop, size_t);
237	static void doinsert(struct parse *, sop, size_t, sopno);
238	static void dofwd(struct parse *, sopno, sop);
239	static void enlarge(struct parse *, sopno);
240	static void stripsnug(struct parse , struct* re_guts *);
241	static void findmust(struct parse , struct* re_guts *);
242	static sopno pluscount(struct parse , struct* re_guts *);
243
static char nuls[10];		/* place to point scanner in event of error */

/*
 * macros for use with parse structure
 * BEWARE:  these know that the parse structure is named `p' !!!
 *
 * Scanning:  PEEK/PEEK2 look at the next one/two pattern characters without
 * consuming them; MORE/MORE2 say whether at least one/two characters remain;
 * SEE/SEETWO combine both; EAT/EATTWO consume on a match and yield 1, else 0.
 * Errors:    SETERROR records an error via seterr(); REQUIRE records `e`
 * unless `co` holds.
 * Emitting:  EMIT appends an operator to the strip, INSERT opens a slot at
 * `pos`, AHEAD/ASTERN fill in forward/backward offsets relative to HERE().
 */
#define	PEEK()	(*p->next)
#define	PEEK2()	(*(p->next+1))
#define	MORE()	(p->end - p->next > 0)
#define	MORE2()	(p->end - p->next > 1)
#define	SEE(c)	(MORE() && PEEK() == (c))
#define	SEETWO(a, b)	(MORE2() && PEEK() == (a) && PEEK2() == (b))
#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
#define	NEXT()	(p->next++)
#define	NEXT2()	(p->next += 2)
#define	NEXTn(n)	(p->next += (n))
#define	GETNEXT()	(*p->next++)
#define	SETERROR(e)	seterr(p, (e))
#define	REQUIRE(co, e)	(void)((co) || SETERROR(e))
#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
#define	HERE()		(p->slen)
#define	THERE()		(p->slen - 1)
#define	THERETHERE()	(p->slen - 2)
#define	DROP(n)	(p->slen -= (n))

/* Largest repetition count accepted in a bound; REGINFINITY means "no upper limit". */
#ifdef	_POSIX2_RE_DUP_MAX
#define	DUPMAX	_POSIX2_RE_DUP_MAX
#else
#define	DUPMAX	255
#endif
#define	REGINFINITY	(DUPMAX + 1)

#ifndef NDEBUG
static int never = 0;		/* for use in asserts; shuts lint up */
#else
#define	never	0		/* some <assert.h>s have bugs too */
#endif
288
289	/*
290	- llvm_regcomp - interface for parser and compilation
291	*/
292	int / 0 success, otherwise REG_something /
293	llvm_regcomp(llvm_regex_t preg, const* char pattern, int* cflags)
294	{
295	struct parse pa;
296	struct re_guts *g;
297	struct parse *p = &pa;
298	int i;
299	size_t len;
300	#ifdef REDEBUG
301	# define GOODFLAGS(f) (f)
302	#else
303	# define GOODFLAGS(f) ((f)&~REG_DUMP)
304	#endif
305
306	cflags = GOODFLAGS(cflags);
307	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
308	return(REG_INVARG);
309
310	if (cflags&REG_PEND) {
311	if (preg->re_endp < pattern)
312	return(REG_INVARG);
313	len = preg->re_endp - pattern;
314	} else
315	len = strlen(s: (const char *)pattern);
316
317	/ do the mallocs early so failure handling is easy /
318	g = (struct re_guts )malloc(size: sizeof(struct* re_guts) +
319	(NC-`1`)*sizeof(cat_t));
320	if (g == NULL)
321	return(REG_ESPACE);
322	p->ssize = len/(size_t)`2`(size_t)`3` + (size_t)`1`; /* ugh /
323	p->strip = (sop )calloc(nmemb: p->ssize, size: sizeof*(sop));
324	p->slen = `0`;
325	if (p->strip == NULL) {
326	free(ptr: (char *)g);
327	return(REG_ESPACE);
328	}
329
330	/ set things up /
331	p->g = g;
332	p->next = pattern;
333	p->end = p->next + len;
334	p->error = `0`;
335	p->ncsalloc = `0`;
336	for (i = `0`; i < NPAREN; i++) {
337	p->pbegin[i] = `0`;
338	p->pend[i] = `0`;
339	}
340	g->csetsize = NC;
341	g->sets = NULL;
342	g->setbits = NULL;
343	g->ncsets = `0`;
344	g->cflags = cflags;
345	g->iflags = `0`;
346	g->nbol = `0`;
347	g->neol = `0`;
348	g->must = NULL;
349	g->mlen = `0`;
350	g->nsub = `0`;
351	g->ncategories = `1`; / category 0 is "everything else" /
352	g->categories = &g->catspace[-(CHAR_MIN)];
353	(void) memset(s: (char )g->catspace, c: `0`, NCsizeof(cat_t));
354	g->backrefs = `0`;
355
356	/ do it /
357	EMIT(OEND, `0`);
358	g->firststate = THERE();
359	if (cflags&REG_EXTENDED)
360	p_ere(p, OUT);
361	else if (cflags&REG_NOSPEC)
362	p_str(p);
363	else
364	p_bre(p, OUT, OUT);
365	EMIT(OEND, `0`);
366	g->laststate = THERE();
367
368	/ tidy up loose ends and fill things in /
369	categorize(p, g);
370	stripsnug(p, g);
371	findmust(p, g);
372	g->nplus = pluscount(p, g);
373	g->magic = MAGIC2;
374	preg->re_nsub = g->nsub;
375	preg->re_g = g;
376	preg->re_magic = MAGIC1;
377	#ifndef REDEBUG
378	/ not debugging, so can't rely on the assert() in llvm_regexec() /
379	if (g->iflags&REGEX_BAD)
380	SETERROR(REG_ASSERT);
381	#endif
382
383	/ win or lose, we're done /
384	if (p->error != `0`) / lose /
385	llvm_regfree(preg);
386	return(p->error);
387	}
388
389	/*
390	- p_ere - ERE parser top level, concatenation and alternation
391	*/
392	static void
393	p_ere(struct parse p, int* stop) / character this ERE should end at /
394	{
395	char c;
396	sopno prevback = `0`;
397	sopno prevfwd = `0`;
398	sopno conc;
399	int first = `1`; / is this the first alternative? /
400
401	for (;;) {
402	/ do a bunch of concatenated expressions /
403	conc = HERE();
404	while (MORE() && (c = PEEK()) != `'\|'` && c != stop)
405	p_ere_exp(p);
406	REQUIRE(HERE() != conc, REG_EMPTY); / require nonempty /
407
408	if (!EAT(`'\|'`))
409	break; / NOTE BREAK OUT /
410
411	if (first) {
412	INSERT(OCH_, conc); / offset is wrong /
413	prevfwd = conc;
414	prevback = conc;
415	first = `0`;
416	}
417	ASTERN(OOR1, prevback);
418	prevback = THERE();
419	AHEAD(prevfwd); / fix previous offset /
420	prevfwd = HERE();
421	EMIT(OOR2, `0`); / offset is very wrong /
422	}
423
424	if (!first) { / tail-end fixups /
425	AHEAD(prevfwd);
426	ASTERN(O_CH, prevback);
427	}
428
429	assert(!MORE() \|\| SEE(stop));
430	}
431
432	/*
433	- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
434	*/
435	static void
436	p_ere_exp(struct parse *p)
437	{
438	char c;
439	sopno pos;
440	int count;
441	int count2;
442	int backrefnum;
443	sopno subno;
444	int wascaret = `0`;
445
446	assert(MORE()); / caller should have ensured this /
447	c = GETNEXT();
448
449	pos = HERE();
450	switch (c) {
451	case `'('`:
452	REQUIRE(MORE(), REG_EPAREN);
453	p->g->nsub++;
454	subno = p->g->nsub;
455	if (subno < NPAREN)
456	p->pbegin[subno] = HERE();
457	EMIT(OLPAREN, subno);
458	if (!SEE(`')'`))
459	p_ere(p, stop: `')'`);
460	if (subno < NPAREN) {
461	p->pend[subno] = HERE();
462	assert(p->pend[subno] != `0`);
463	}
464	EMIT(ORPAREN, subno);
465	MUSTEAT(`')'`, REG_EPAREN);
466	break;
467	#ifndef POSIX_MISTAKE
468	case `')'`: / happens only if no current unmatched ( /
469	/*
470	* You may ask, why the ifndef? Because I didn't notice
471	* this until slightly too late for 1003.2, and none of the
472	* other 1003.2 regular-expression reviewers noticed it at
473	* all. So an unmatched ) is legal POSIX, at least until
474	* we can get it fixed.
475	*/
476	SETERROR(REG_EPAREN);
477	break;
478	#endif
479	case `'^'`:
480	EMIT(OBOL, `0`);
481	p->g->iflags \|= USEBOL;
482	p->g->nbol++;
483	wascaret = `1`;
484	break;
485	case `'$'`:
486	EMIT(OEOL, `0`);
487	p->g->iflags \|= USEEOL;
488	p->g->neol++;
489	break;
490	case `'\|'`:
491	SETERROR(REG_EMPTY);
492	break;
493	case `'*'`:
494	case `'+'`:
495	case `'?'`:
496	SETERROR(REG_BADRPT);
497	break;
498	case `'.'`:
499	if (p->g->cflags&REG_NEWLINE)
500	nonnewline(p);
501	else
502	EMIT(OANY, `0`);
503	break;
504	case `'['`:
505	p_bracket(p);
506	break;
507	case `'\\'`:
508	REQUIRE(MORE(), REG_EESCAPE);
509	c = GETNEXT();
510	if (c >= `'1'` && c <= `'9'`) {
511	/ \[0-9] is taken to be a back-reference to a previously specified*
512	* matching group. backrefnum will hold the number. The matching
513	* group must exist (i.e. if \4 is found there must have been at
514	* least 4 matching groups specified in the pattern previously).
515	*/
516	backrefnum = c - `'0'`;
517	if (p->pend[backrefnum] == `0`) {
518	SETERROR(REG_ESUBREG);
519	break;
520	}
521
522	/ Make sure everything checks out and emit the sequence*
523	* that marks a back-reference to the parse structure.
524	*/
525	assert(backrefnum <= p->g->nsub);
526	EMIT(OBACK_, backrefnum);
527	assert(p->pbegin[backrefnum] != `0`);
528	assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
529	assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
530	(void) dupl(p, p->pbegin[backrefnum]+`1`, p->pend[backrefnum]);
531	EMIT(O_BACK, backrefnum);
532	p->g->backrefs = `1`;
533	} else {
534	/ Other chars are simply themselves when escaped with a backslash.*
535	*/
536	ordinary(p, c);
537	}
538	break;
539	case `'{'`: / okay as ordinary except if digit follows /
540	REQUIRE(!MORE() \|\| !isdigit((uch)PEEK()), REG_BADRPT);
541	LLVM_FALLTHROUGH;
542	default:
543	ordinary(p, c);
544	break;
545	}
546
547	if (!MORE())
548	return;
549	c = PEEK();
550	/ we call { a repetition if followed by a digit /
551	if (!( c == `'*'` \|\| c == `'+'` \|\| c == `'?'` \|\|
552	(c == `'{'` && MORE2() && isdigit((uch)PEEK2())) ))
553	return; / no repetition, we're done /
554	NEXT();
555
556	REQUIRE(!wascaret, REG_BADRPT);
557	switch (c) {
558	case `''`: /* implemented as +? /
559	/ this case does not require the (y\|) trick, noKLUDGE /
560	INSERT(OPLUS_, pos);
561	ASTERN(O_PLUS, pos);
562	INSERT(OQUEST_, pos);
563	ASTERN(O_QUEST, pos);
564	break;
565	case `'+'`:
566	INSERT(OPLUS_, pos);
567	ASTERN(O_PLUS, pos);
568	break;
569	case `'?'`:
570	/ KLUDGE: emit y? as (y\|) until subtle bug gets fixed /
571	INSERT(OCH_, pos); / offset slightly wrong /
572	ASTERN(OOR1, pos); / this one's right /
573	AHEAD(pos); / fix the OCH_ /
574	EMIT(OOR2, `0`); / offset very wrong... /
575	AHEAD(THERE()); / ...so fix it /
576	ASTERN(O_CH, THERETHERE());
577	break;
578	case `'{'`:
579	count = p_count(p);
580	if (EAT(`','`)) {
581	if (isdigit((uch)PEEK())) {
582	count2 = p_count(p);
583	REQUIRE(count <= count2, REG_BADBR);
584	} else / single number with comma /
585	count2 = REGINFINITY;
586	} else / just a single number /
587	count2 = count;
588	repeat(p, pos, count, count2);
589	if (!EAT(`'}'`)) { / error heuristics /
590	while (MORE() && PEEK() != `'}'`)
591	NEXT();
592	REQUIRE(MORE(), REG_EBRACE);
593	SETERROR(REG_BADBR);
594	}
595	break;
596	}
597
598	if (!MORE())
599	return;
600	c = PEEK();
601	if (!( c == `'*'` \|\| c == `'+'` \|\| c == `'?'` \|\|
602	(c == `'{'` && MORE2() && isdigit((uch)PEEK2())) ) )
603	return;
604	SETERROR(REG_BADRPT);
605	}
606
607	/*
608	- p_str - string (no metacharacters) "parser"
609	*/
610	static void
611	p_str(struct parse *p)
612	{
613	REQUIRE(MORE(), REG_EMPTY);
614	while (MORE())
615	ordinary(p, GETNEXT());
616	}
617
618	/*
619	- p_bre - BRE parser top level, anchoring and concatenation
620	* Giving end1 as OUT essentially eliminates the end1/end2 check.
621	*
622	* This implementation is a bit of a kludge, in that a trailing $ is first
623	* taken as an ordinary character and then revised to be an anchor. The
624	* only undesirable side effect is that '$' gets included as a character
625	* category in such cases. This is fairly harmless; not worth fixing.
626	* The amount of lookahead needed to avoid this kludge is excessive.
627	*/
628	static void
629	p_bre(struct parse *p,
630	int end1, / first terminating character /
631	int end2) / second terminating character /
632	{
633	sopno start = HERE();
634	int first = `1`; / first subexpression? /
635	int wasdollar = `0`;
636
637	if (EAT(`'^'`)) {
638	EMIT(OBOL, `0`);
639	p->g->iflags \|= USEBOL;
640	p->g->nbol++;
641	}
642	while (MORE() && !SEETWO(end1, end2)) {
643	wasdollar = p_simp_re(p, first);
644	first = `0`;
645	}
646	if (wasdollar) { / oops, that was a trailing anchor /
647	DROP(`1`);
648	EMIT(OEOL, `0`);
649	p->g->iflags \|= USEEOL;
650	p->g->neol++;
651	}
652
653	REQUIRE(HERE() != start, REG_EMPTY); / require nonempty /
654	}
655
656	/*
657	- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
658	*/
659	static int / was the simple RE an unbackslashed $? /
660	p_simp_re(struct parse *p,
661	int starordinary) / is a leading * an ordinary character? /
662	{
663	int c;
664	int count;
665	int count2;
666	sopno pos;
667	int i;
668	sopno subno;
669	# define BACKSL (1<<CHAR_BIT)
670
671	pos = HERE(); / repetition op, if any, covers from here /
672
673	assert(MORE()); / caller should have ensured this /
674	c = GETNEXT();
675	if (c == `'\\'`) {
676	REQUIRE(MORE(), REG_EESCAPE);
677	c = BACKSL \| GETNEXT();
678	}
679	switch (c) {
680	case `'.'`:
681	if (p->g->cflags&REG_NEWLINE)
682	nonnewline(p);
683	else
684	EMIT(OANY, `0`);
685	break;
686	case `'['`:
687	p_bracket(p);
688	break;
689	case BACKSL\|`'{'`:
690	SETERROR(REG_BADRPT);
691	break;
692	case BACKSL\|`'('`:
693	p->g->nsub++;
694	subno = p->g->nsub;
695	if (subno < NPAREN)
696	p->pbegin[subno] = HERE();
697	EMIT(OLPAREN, subno);
698	/ the MORE here is an error heuristic /
699	if (MORE() && !SEETWO(`'\\'`, `')'`))
700	p_bre(p, end1: `'\\'`, end2: `')'`);
701	if (subno < NPAREN) {
702	p->pend[subno] = HERE();
703	assert(p->pend[subno] != `0`);
704	}
705	EMIT(ORPAREN, subno);
706	REQUIRE(EATTWO(`'\\'`, `')'`), REG_EPAREN);
707	break;
708	case BACKSL\|`')'`: / should not get here -- must be user /
709	case BACKSL\|`'}'`:
710	SETERROR(REG_EPAREN);
711	break;
712	case BACKSL\|`'1'`:
713	case BACKSL\|`'2'`:
714	case BACKSL\|`'3'`:
715	case BACKSL\|`'4'`:
716	case BACKSL\|`'5'`:
717	case BACKSL\|`'6'`:
718	case BACKSL\|`'7'`:
719	case BACKSL\|`'8'`:
720	case BACKSL\|`'9'`:
721	i = (c&~BACKSL) - `'0'`;
722	assert(i < NPAREN);
723	if (p->pend[i] != `0`) {
724	assert(i <= p->g->nsub);
725	EMIT(OBACK_, i);
726	assert(p->pbegin[i] != `0`);
727	assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
728	assert(OP(p->strip[p->pend[i]]) == ORPAREN);
729	(void) dupl(p, p->pbegin[i]+`1`, p->pend[i]);
730	EMIT(O_BACK, i);
731	} else
732	SETERROR(REG_ESUBREG);
733	p->g->backrefs = `1`;
734	break;
735	case `'*'`:
736	REQUIRE(starordinary, REG_BADRPT);
737	LLVM_FALLTHROUGH;
738	default:
739	ordinary(p, (char)c);
740	break;
741	}
742
743	if (EAT(`''`)) { /* implemented as +? /
744	/ this case does not require the (y\|) trick, noKLUDGE /
745	INSERT(OPLUS_, pos);
746	ASTERN(O_PLUS, pos);
747	INSERT(OQUEST_, pos);
748	ASTERN(O_QUEST, pos);
749	} else if (EATTWO(`'\\'`, `'{'`)) {
750	count = p_count(p);
751	if (EAT(`','`)) {
752	if (MORE() && isdigit((uch)PEEK())) {
753	count2 = p_count(p);
754	REQUIRE(count <= count2, REG_BADBR);
755	} else / single number with comma /
756	count2 = REGINFINITY;
757	} else / just a single number /
758	count2 = count;
759	repeat(p, pos, count, count2);
760	if (!EATTWO(`'\\'`, `'}'`)) { / error heuristics /
761	while (MORE() && !SEETWO(`'\\'`, `'}'`))
762	NEXT();
763	REQUIRE(MORE(), REG_EBRACE);
764	SETERROR(REG_BADBR);
765	}
766	} else if (c == `'$'`) / $ (but not \$) ends it /
767	return(`1`);
768
769	return(`0`);
770	}
771
772	/*
773	- p_count - parse a repetition count
774	*/
775	static int / the value /
776	p_count(struct parse *p)
777	{
778	int count = `0`;
779	int ndigits = `0`;
780
781	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
782	count = count*`10` + (GETNEXT() - `'0'`);
783	ndigits++;
784	}
785
786	REQUIRE(ndigits > `0` && count <= DUPMAX, REG_BADBR);
787	return(count);
788	}
789
790	/*
791	- p_bracket - parse a bracketed character list
792	*
793	* Note a significant property of this code: if the allocset() did SETERROR,
794	* no set operations are done.
795	*/
796	static void
797	p_bracket(struct parse *p)
798	{
799	cset *cs;
800	int invert = `0`;
801
802	/ Dept of Truly Sickening Special-Case Kludges /
803	if (p->end - p->next > `5`) {
804	if (strncmp(s1: p->next, s2: "[:<:]]", n: `6`) == `0`) {
805	EMIT(OBOW, `0`);
806	NEXTn(`6`);
807	return;
808	}
809	if (strncmp(s1: p->next, s2: "[:>:]]", n: `6`) == `0`) {
810	EMIT(OEOW, `0`);
811	NEXTn(`6`);
812	return;
813	}
814	}
815
816	if ((cs = allocset(p)) == NULL) {
817	/ allocset did set error status in p /
818	return;
819	}
820
821	if (EAT(`'^'`))
822	invert++; / make note to invert set at end /
823	if (EAT(`']'`))
824	CHadd(cs, `']'`);
825	else if (EAT(`'-'`))
826	CHadd(cs, `'-'`);
827	while (MORE() && PEEK() != `']'` && !SEETWO(`'-'`, `']'`))
828	p_b_term(p, cs);
829	if (EAT(`'-'`))
830	CHadd(cs, `'-'`);
831	MUSTEAT(`']'`, REG_EBRACK);
832
833	if (p->error != `0`) { / don't mess things up further /
834	freeset(p, cs);
835	return;
836	}
837
838	if (p->g->cflags&REG_ICASE) {
839	int i;
840	int ci;
841
842	for (i = p->g->csetsize - `1`; i >= `0`; i--)
843	if (CHIN(cs, i) && isalpha(i)) {
844	ci = othercase(i);
845	if (ci != i)
846	CHadd(cs, ci);
847	}
848	if (cs->multis != NULL)
849	mccase(p, cs);
850	}
851	if (invert) {
852	int i;
853
854	for (i = p->g->csetsize - `1`; i >= `0`; i--)
855	if (CHIN(cs, i))
856	CHsub(cs, i);
857	else
858	CHadd(cs, i);
859	if (p->g->cflags&REG_NEWLINE)
860	CHsub(cs, `'\n'`);
861	if (cs->multis != NULL)
862	mcinvert(p, cs);
863	}
864
865	assert(cs->multis == NULL); / xxx /
866
867	if (nch(p, cs) == `1`) { / optimize singleton sets /
868	ordinary(p, firstch(p, cs));
869	freeset(p, cs);
870	} else
871	EMIT(OANYOF, freezeset(p, cs));
872	}
873
874	/*
875	- p_b_term - parse one term of a bracketed character list
876	*/
877	static void
878	p_b_term(struct parse p, cset cs)
879	{
880	char c;
881	char start, finish;
882	int i;
883
884	/ classify what we've got /
885	switch ((MORE()) ? PEEK() : `'\0'`) {
886	case `'['`:
887	c = (MORE2()) ? PEEK2() : `'\0'`;
888	break;
889	case `'-'`:
890	SETERROR(REG_ERANGE);
891	return; / NOTE RETURN /
892	break;
893	default:
894	c = `'\0'`;
895	break;
896	}
897
898	switch (c) {
899	case `':'`: / character class /
900	NEXT2();
901	REQUIRE(MORE(), REG_EBRACK);
902	c = PEEK();
903	REQUIRE(c != `'-'` && c != `']'`, REG_ECTYPE);
904	p_b_cclass(p, cs);
905	REQUIRE(MORE(), REG_EBRACK);
906	REQUIRE(EATTWO(`':'`, `']'`), REG_ECTYPE);
907	break;
908	case `'='`: / equivalence class /
909	NEXT2();
910	REQUIRE(MORE(), REG_EBRACK);
911	c = PEEK();
912	REQUIRE(c != `'-'` && c != `']'`, REG_ECOLLATE);
913	p_b_eclass(p, cs);
914	REQUIRE(MORE(), REG_EBRACK);
915	REQUIRE(EATTWO(`'='`, `']'`), REG_ECOLLATE);
916	break;
917	default: / symbol, ordinary character, or range /
918	/ xxx revision needed for multichar stuff /
919	start = p_b_symbol(p);
920	if (SEE(`'-'`) && MORE2() && PEEK2() != `']'`) {
921	/ range /
922	NEXT();
923	if (EAT(`'-'`))
924	finish = `'-'`;
925	else
926	finish = p_b_symbol(p);
927	} else
928	finish = start;
929	/ xxx what about signed chars here... /
930	REQUIRE(start <= finish, REG_ERANGE);
931	for (i = start; i <= finish; i++)
932	CHadd(cs, i);
933	break;
934	}
935	}
936
937	/*
938	- p_b_cclass - parse a character-class name and deal with it
939	*/
940	static void
941	p_b_cclass(struct parse p, cset cs)
942	{
943	const char *sp = p->next;
944	struct cclass *cp;
945	size_t len;
946	const char *u;
947	char c;
948
949	while (MORE() && isalpha((uch)PEEK()))
950	NEXT();
951	len = p->next - sp;
952	for (cp = cclasses; cp->name != NULL; cp++)
953	if (strncmp(s1: cp->name, s2: sp, n: len) == `0` && cp->name[len] == `'\0'`)
954	break;
955	if (cp->name == NULL) {
956	/ oops, didn't find it /
957	SETERROR(REG_ECTYPE);
958	return;
959	}
960
961	u = cp->chars;
962	while ((c = *u++) != `'\0'`)
963	CHadd(cs, c);
964	for (u = cp->multis; *u != `'\0'`; u += strlen(s: u) + `1`)
965	MCadd(p, cs, u);
966	}
967
968	/*
969	- p_b_eclass - parse an equivalence-class name and deal with it
970	*
971	* This implementation is incomplete. xxx
972	*/
973	static void
974	p_b_eclass(struct parse p, cset cs)
975	{
976	char c;
977
978	c = p_b_coll_elem(p, `'='`);
979	CHadd(cs, c);
980	}
981
982	/*
983	- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
984	*/
985	static char / value of symbol /
986	p_b_symbol(struct parse *p)
987	{
988	char value;
989
990	REQUIRE(MORE(), REG_EBRACK);
991	if (!EATTWO(`'['`, `'.'`))
992	return(GETNEXT());
993
994	/ collating symbol /
995	value = p_b_coll_elem(p, `'.'`);
996	REQUIRE(EATTWO(`'.'`, `']'`), REG_ECOLLATE);
997	return(value);
998	}
999
1000	/*
1001	- p_b_coll_elem - parse a collating-element name and look it up
1002	*/
1003	static char / value of collating element /
1004	p_b_coll_elem(struct parse *p,
1005	int endc) / name ended by endc,']' /
1006	{
1007	const char *sp = p->next;
1008	struct cname *cp;
1009	size_t len;
1010
1011	while (MORE() && !SEETWO(endc, `']'`))
1012	NEXT();
1013	if (!MORE()) {
1014	SETERROR(REG_EBRACK);
1015	return(`0`);
1016	}
1017	len = p->next - sp;
1018	for (cp = cnames; cp->name != NULL; cp++)
1019	if (strncmp(s1: cp->name, s2: sp, n: len) == `0` && strlen(s: cp->name) == len)
1020	return(cp->code); / known name /
1021	if (len == `1`)
1022	return(sp); /* single character /
1023	SETERROR(REG_ECOLLATE); / neither /
1024	return(`0`);
1025	}
1026
/*
 - othercase - return the case counterpart of an alphabetic
 *
 * `ch` is first reduced to the unsigned-char range, as the <ctype.h>
 * functions require.  Upper-case letters map to lower case and vice versa;
 * an alphabetic with no counterpart is returned unchanged.
 */
static char			/* if no counterpart, return ch */
othercase(int ch)
{
	ch = (unsigned char)ch;
	assert(isalpha(ch));
	if (isupper(ch))
		return ((unsigned char)tolower(ch));
	else if (islower(ch))
		return ((unsigned char)toupper(ch));
	else			/* peculiar, but could happen */
		return(ch);
}
1042
1043	/*
1044	- bothcases - emit a dualcase version of a two-case character
1045	*
1046	* Boy, is this implementation ever a kludge...
1047	*/
1048	static void
1049	bothcases(struct parse p, int* ch)
1050	{
1051	const char *oldnext = p->next;
1052	const char *oldend = p->end;
1053	char bracket[`3`];
1054
1055	ch = (uch)ch;
1056	assert(othercase(ch) != ch); / p_bracket() would recurse /
1057	p->next = bracket;
1058	p->end = bracket+`2`;
1059	bracket[`0`] = ch;
1060	bracket[`1`] = `']'`;
1061	bracket[`2`] = `'\0'`;
1062	p_bracket(p);
1063	assert(p->next == bracket+`2`);
1064	p->next = oldnext;
1065	p->end = oldend;
1066	}
1067
1068	/*
1069	- ordinary - emit an ordinary character
1070	*/
1071	static void
1072	ordinary(struct parse p, int* ch)
1073	{
1074	cat_t *cap = p->g->categories;
1075
1076	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1077	bothcases(p, ch);
1078	else {
1079	EMIT(OCHAR, (uch)ch);
1080	if (cap[ch] == `0`)
1081	cap[ch] = p->g->ncategories++;
1082	}
1083	}
1084
1085	/*
1086	- nonnewline - emit REG_NEWLINE version of OANY
1087	*
1088	* Boy, is this implementation ever a kludge...
1089	*/
1090	static void
1091	nonnewline(struct parse *p)
1092	{
1093	const char *oldnext = p->next;
1094	const char *oldend = p->end;
1095	static const char bracket[`4`] = {`'^'`, `'\n'`, `']'`, `'\0'`};
1096
1097	p->next = bracket;
1098	p->end = bracket+`3`;
1099	p_bracket(p);
1100	assert(p->next == bracket+`3`);
1101	p->next = oldnext;
1102	p->end = oldend;
1103	}
1104
1105	/*
1106	- repeat - generate code for a bounded repetition, recursively if needed
1107	*/
1108	static void
1109	repeat(struct parse *p,
1110	sopno start, / operand from here to end of strip /
1111	int from, / repeated from this number /
1112	int to) / to this number of times (maybe INFINITY) /
1113	{
1114	sopno finish = HERE();
1115	# define N 2
1116	# define INF 3
1117	# define REP(f, t) ((f)*8 + (t))
1118	# define MAP(n) (((n) <= 1) ? (n) : ((n) == REGINFINITY) ? INF : N)
1119	sopno copy;
1120
1121	if (p->error != `0`) / head off possible runaway recursion /
1122	return;
1123
1124	assert(from <= to);
1125
1126	switch (REP(MAP(from), MAP(to))) {
1127	case REP(`0`, `0`): / must be user doing this /
1128	DROP(finish-start); / drop the operand /
1129	break;
1130	case REP(`0`, `1`): / as x{1,1}? /
1131	case REP(`0`, N): / as x{1,n}? /
1132	case REP(`0`, INF): / as x{1,}? /
1133	/ KLUDGE: emit y? as (y\|) until subtle bug gets fixed /
1134	INSERT(OCH_, start); / offset is wrong... /
1135	repeat(p, start: start+`1`, from: `1`, to);
1136	ASTERN(OOR1, start);
1137	AHEAD(start); / ... fix it /
1138	EMIT(OOR2, `0`);
1139	AHEAD(THERE());
1140	ASTERN(O_CH, THERETHERE());
1141	break;
1142	case REP(`1`, `1`): / trivial case /
1143	/ done /
1144	break;
1145	case REP(`1`, N): / as x?x{1,n-1} /
1146	/ KLUDGE: emit y? as (y\|) until subtle bug gets fixed /
1147	INSERT(OCH_, start);
1148	ASTERN(OOR1, start);
1149	AHEAD(start);
1150	EMIT(OOR2, `0`); / offset very wrong... /
1151	AHEAD(THERE()); / ...so fix it /
1152	ASTERN(O_CH, THERETHERE());
1153	copy = dupl(p, start+`1`, finish+`1`);
1154	assert(copy == finish+`4`);
1155	repeat(p, start: copy, from: `1`, to: to-`1`);
1156	break;
1157	case REP(`1`, INF): / as x+ /
1158	INSERT(OPLUS_, start);
1159	ASTERN(O_PLUS, start);
1160	break;
1161	case REP(N, N): / as xx{m-1,n-1} /
1162	copy = dupl(p, start, finish);
1163	repeat(p, start: copy, from: from-`1`, to: to-`1`);
1164	break;
1165	case REP(N, INF): / as xx{n-1,INF} /
1166	copy = dupl(p, start, finish);
1167	repeat(p, start: copy, from: from-`1`, to);
1168	break;
1169	default: / "can't happen" /
1170	SETERROR(REG_ASSERT); / just in case /
1171	break;
1172	}
1173	}
1174
1175	/*
1176	- seterr - set an error condition
1177	*/
1178	static int / useless but makes type checking happy /
1179	seterr(struct parse p, int* e)
1180	{
1181	if (p->error == `0`) / keep earliest error condition /
1182	p->error = e;
1183	p->next = nuls; / try to bring things to a halt /
1184	p->end = nuls;
1185	return(`0`); / make the return value well-defined /
1186	}
1187
1188	/*
1189	- allocset - allocate a set of characters for []
1190	*/
1191	static cset *
1192	allocset(struct parse *p)
1193	{
1194	int no = p->g->ncsets++;
1195	size_t nc;
1196	size_t nbytes;
1197	cset *cs;
1198	size_t css = (size_t)p->g->csetsize;
1199	int i;
1200
1201	if (no >= p->ncsalloc) { / need another column of space /
1202	void *ptr;
1203
1204	p->ncsalloc += CHAR_BIT;
1205	nc = p->ncsalloc;
1206	if (nc > SIZE_MAX / sizeof(cset))
1207	goto nomem;
1208	assert(nc % CHAR_BIT == `0`);
1209	nbytes = nc / CHAR_BIT * css;
1210
1211	ptr = (cset )realloc(ptr: (char* )p->g->sets, size: nc sizeof(cset));
1212	if (ptr == NULL)
1213	goto nomem;
1214	p->g->sets = ptr;
1215
1216	ptr = (uch )realloc(ptr: (char* *)p->g->setbits, size: nbytes);
1217	if (ptr == NULL)
1218	goto nomem;
1219	p->g->setbits = ptr;
1220
1221	for (i = `0`; i < no; i++)
1222	p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1223
1224	(void) memset(s: (char *)p->g->setbits + (nbytes - css), c: `0`, n: css);
1225	}
1226	/ XXX should not happen /
1227	if (p->g->sets == NULL \|\| p->g->setbits == NULL)
1228	goto nomem;
1229
1230	cs = &p->g->sets[no];
1231	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1232	cs->mask = `1` << ((no) % CHAR_BIT);
1233	cs->hash = `0`;
1234	cs->smultis = `0`;
1235	cs->multis = NULL;
1236
1237	return(cs);
1238	nomem:
1239	free(ptr: p->g->sets);
1240	p->g->sets = NULL;
1241	free(ptr: p->g->setbits);
1242	p->g->setbits = NULL;
1243
1244	SETERROR(REG_ESPACE);
1245	/ caller's responsibility not to do set ops /
1246	return(NULL);
1247	}
1248
1249	/*
1250	- freeset - free a now-unused set
1251	*/
1252	static void
1253	freeset(struct parse p, cset cs)
1254	{
1255	size_t i;
1256	cset *top = &p->g->sets[p->g->ncsets];
1257	size_t css = (size_t)p->g->csetsize;
1258
1259	for (i = `0`; i < css; i++)
1260	CHsub(cs, i);
1261	if (cs == top-`1`) / recover only the easy case /
1262	p->g->ncsets--;
1263	}
1264
1265	/*
1266	- freezeset - final processing on a set of characters
1267	*
1268	* The main task here is merging identical sets. This is usually a waste
1269	* of time (although the hash code minimizes the overhead), but can win
1270	* big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1271	* is done using addition rather than xor -- all ASCII [aA] sets xor to
1272	* the same value!
1273	*/
1274	static int / set number /
1275	freezeset(struct parse p, cset cs)
1276	{
1277	uch h = cs->hash;
1278	size_t i;
1279	cset *top = &p->g->sets[p->g->ncsets];
1280	cset *cs2;
1281	size_t css = (size_t)p->g->csetsize;
1282
1283	/ look for an earlier one which is the same /
1284	for (cs2 = &p->g->sets[`0`]; cs2 < top; cs2++)
1285	if (cs2->hash == h && cs2 != cs) {
1286	/ maybe /
1287	for (i = `0`; i < css; i++)
1288	if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1289	break; / no /
1290	if (i == css)
1291	break; / yes /
1292	}
1293
1294	if (cs2 < top) { / found one /
1295	freeset(p, cs);
1296	cs = cs2;
1297	}
1298
1299	return((int)(cs - p->g->sets));
1300	}
1301
1302	/*
1303	- firstch - return first character in a set (which must have at least one)
1304	*/
1305	static int / character; there is no "none" value /
1306	firstch(struct parse p, cset cs)
1307	{
1308	size_t i;
1309	size_t css = (size_t)p->g->csetsize;
1310
1311	for (i = `0`; i < css; i++)
1312	if (CHIN(cs, i))
1313	return((char)i);
1314	assert(never);
1315	return(`0`); / arbitrary /
1316	}
1317
1318	/*
1319	- nch - number of characters in a set
1320	*/
1321	static int
1322	nch(struct parse p, cset cs)
1323	{
1324	size_t i;
1325	size_t css = (size_t)p->g->csetsize;
1326	int n = `0`;
1327
1328	for (i = `0`; i < css; i++)
1329	if (CHIN(cs, i))
1330	n++;
1331	return(n);
1332	}
1333
1334	/*
1335	- mcadd - add a collating element to a cset
1336	*/
1337	static void
1338	mcadd( struct parse p, cset cs, const char *cp)
1339	{
1340	size_t oldend = cs->smultis;
1341	void *np;
1342
1343	cs->smultis += strlen(s: cp) + `1`;
1344	np = realloc(ptr: cs->multis, size: cs->smultis);
1345	if (np == NULL) {
1346	if (cs->multis)
1347	free(ptr: cs->multis);
1348	cs->multis = NULL;
1349	SETERROR(REG_ESPACE);
1350	return;
1351	}
1352	cs->multis = np;
1353
1354	llvm_strlcpy(dst: cs->multis + oldend - `1`, src: cp, siz: cs->smultis - oldend + `1`);
1355	}
1356
1357	/*
1358	- mcinvert - invert the list of collating elements in a cset
1359	*
1360	* This would have to know the set of possibilities. Implementation
1361	* is deferred.
1362	*/
1363	/ ARGSUSED /
1364	static void
1365	mcinvert(struct parse p, cset cs)
1366	{
1367	assert(cs->multis == NULL); / xxx /
1368	}
1369
1370	/*
1371	- mccase - add case counterparts of the list of collating elements in a cset
1372	*
1373	* This would have to know the set of possibilities. Implementation
1374	* is deferred.
1375	*/
1376	/ ARGSUSED /
1377	static void
1378	mccase(struct parse p, cset cs)
1379	{
1380	assert(cs->multis == NULL); / xxx /
1381	}
1382
1383	/*
1384	- isinsets - is this character in any sets?
1385	*/
1386	static int / predicate /
1387	isinsets(struct re_guts g, int* c)
1388	{
1389	uch *col;
1390	int i;
1391	int ncols = (g->ncsets+(CHAR_BIT-`1`)) / CHAR_BIT;
1392	unsigned uc = (uch)c;
1393
1394	for (i = `0`, col = g->setbits; i < ncols; i++, col += g->csetsize)
1395	if (col[uc] != `0`)
1396	return(`1`);
1397	return(`0`);
1398	}
1399
1400	/*
1401	- samesets - are these two characters in exactly the same sets?
1402	*/
1403	static int / predicate /
1404	samesets(struct re_guts g, int* c1, int c2)
1405	{
1406	uch *col;
1407	int i;
1408	int ncols = (g->ncsets+(CHAR_BIT-`1`)) / CHAR_BIT;
1409	unsigned uc1 = (uch)c1;
1410	unsigned uc2 = (uch)c2;
1411
1412	for (i = `0`, col = g->setbits; i < ncols; i++, col += g->csetsize)
1413	if (col[uc1] != col[uc2])
1414	return(`0`);
1415	return(`1`);
1416	}
1417
1418	/*
1419	- categorize - sort out character categories
1420	*/
1421	static void
1422	categorize(struct parse p, struct* re_guts *g)
1423	{
1424	cat_t *cats = g->categories;
1425	int c;
1426	int c2;
1427	cat_t cat;
1428
1429	/ avoid making error situations worse /
1430	if (p->error != `0`)
1431	return;
1432
1433	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1434	if (cats[c] == `0` && isinsets(g, c)) {
1435	cat = g->ncategories++;
1436	cats[c] = cat;
1437	for (c2 = c+`1`; c2 <= CHAR_MAX; c2++)
1438	if (cats[c2] == `0` && samesets(g, c1: c, c2))
1439	cats[c2] = cat;
1440	}
1441	}
1442
1443	/*
1444	- dupl - emit a duplicate of a bunch of sops
1445	*/
1446	static sopno / start of duplicate /
1447	dupl(struct parse *p,
1448	sopno start, / from here /
1449	sopno finish) / to this less one /
1450	{
1451	sopno ret = HERE();
1452	sopno len = finish - start;
1453
1454	assert(finish >= start);
1455	if (len == `0`)
1456	return(ret);
1457	enlarge(p, p->ssize + len); / this many unexpected additions /
1458	assert(p->ssize >= p->slen + len);
1459	(void) memmove(dest: (char *)(p->strip + p->slen),
1460	src: (char )(p->strip + start), n: (size_t)lensizeof(sop));
1461	p->slen += len;
1462	return(ret);
1463	}
1464
1465	/*
1466	- doemit - emit a strip operator
1467	*
1468	* It might seem better to implement this as a macro with a function as
1469	* hard-case backup, but it's just too big and messy unless there are
1470	* some changes to the data structures. Maybe later.
1471	*/
1472	static void
1473	doemit(struct parse *p, sop op, size_t opnd)
1474	{
1475	/ avoid making error situations worse /
1476	if (p->error != `0`)
1477	return;
1478
1479	/ deal with oversize operands ("can't happen", more or less) /
1480	assert(opnd < `1`<<OPSHIFT);
1481
1482	/ deal with undersized strip /
1483	if (p->slen >= p->ssize)
1484	enlarge(p, (p->ssize+`1`) / `2` * `3`); / +50% /
1485	assert(p->slen < p->ssize);
1486
1487	/ finally, it's all reduced to the easy case /
1488	p->strip[p->slen++] = SOP(op, opnd);
1489	}
1490
1491	/*
1492	- doinsert - insert a sop into the strip
1493	*/
1494	static void
1495	doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1496	{
1497	sopno sn;
1498	sop s;
1499	int i;
1500
1501	/ avoid making error situations worse /
1502	if (p->error != `0`)
1503	return;
1504
1505	sn = HERE();
1506	EMIT(op, opnd); / do checks, ensure space /
1507	assert(HERE() == sn+`1`);
1508	s = p->strip[sn];
1509
1510	/ adjust paren pointers /
1511	assert(pos > `0`);
1512	for (i = `1`; i < NPAREN; i++) {
1513	if (p->pbegin[i] >= pos) {
1514	p->pbegin[i]++;
1515	}
1516	if (p->pend[i] >= pos) {
1517	p->pend[i]++;
1518	}
1519	}
1520
1521	memmove(dest: (char )&p->strip[pos+`1`], src: (char* *)&p->strip[pos],
1522	n: (HERE()-pos-`1`)*sizeof(sop));
1523	p->strip[pos] = s;
1524	}
1525
1526	/*
1527	- dofwd - complete a forward reference
1528	*/
1529	static void
1530	dofwd(struct parse *p, sopno pos, sop value)
1531	{
1532	/ avoid making error situations worse /
1533	if (p->error != `0`)
1534	return;
1535
1536	assert(value < `1`<<OPSHIFT);
1537	p->strip[pos] = OP(p->strip[pos]) \| value;
1538	}
1539
1540	/*
1541	- enlarge - enlarge the strip
1542	*/
1543	static void
1544	enlarge(struct parse *p, sopno size)
1545	{
1546	sop *sp;
1547
1548	if (p->ssize >= size)
1549	return;
1550
1551	if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
1552	SETERROR(REG_ESPACE);
1553	return;
1554	}
1555
1556	sp = (sop )realloc(ptr: p->strip, size: sizesizeof(sop));
1557	if (sp == NULL) {
1558	SETERROR(REG_ESPACE);
1559	return;
1560	}
1561	p->strip = sp;
1562	p->ssize = size;
1563	}
1564
1565	/*
1566	- stripsnug - compact the strip
1567	*/
1568	static void
1569	stripsnug(struct parse p, struct* re_guts *g)
1570	{
1571	g->nstates = p->slen;
1572	if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
1573	g->strip = p->strip;
1574	SETERROR(REG_ESPACE);
1575	return;
1576	}
1577
1578	g->strip = (sop )realloc(ptr: (char* )p->strip, size: p->slen sizeof(sop));
1579	if (g->strip == NULL) {
1580	SETERROR(REG_ESPACE);
1581	g->strip = p->strip;
1582	}
1583	}
1584
1585	/*
1586	- findmust - fill in must and mlen with longest mandatory literal string
1587	*
1588	* This algorithm could do fancy things like analyzing the operands of \|
1589	* for common subsequences. Someday. This code is simple and finds most
1590	* of the interesting cases.
1591	*
1592	* Note that must and mlen got initialized during setup.
1593	*/
1594	static void
1595	findmust(struct parse p, struct* re_guts *g)
1596	{
1597	sop *scan;
1598	sop start = `0`; /* start initialized in the default case, after that /
1599	sop newstart = `0`; /* newstart was initialized in the OCHAR case /
1600	sopno newlen;
1601	sop s;
1602	char *cp;
1603	sopno i;
1604
1605	/ avoid making error situations worse /
1606	if (p->error != `0`)
1607	return;
1608
1609	/ find the longest OCHAR sequence in strip /
1610	newlen = `0`;
1611	scan = g->strip + `1`;
1612	do {
1613	s = *scan++;
1614	switch (OP(s)) {
1615	case OCHAR: / sequence member /
1616	if (newlen == `0`) / new sequence /
1617	newstart = scan - `1`;
1618	newlen++;
1619	break;
1620	case OPLUS_: / things that don't break one /
1621	case OLPAREN:
1622	case ORPAREN:
1623	break;
1624	case OQUEST_: / things that must be skipped /
1625	case OCH_:
1626	scan--;
1627	do {
1628	scan += OPND(s);
1629	s = *scan;
1630	/ assert() interferes w debug printouts /
1631	if (OP(s) != O_QUEST && OP(s) != O_CH &&
1632	OP(s) != OOR2) {
1633	g->iflags \|= REGEX_BAD;
1634	return;
1635	}
1636	} while (OP(s) != O_QUEST && OP(s) != O_CH);
1637	LLVM_FALLTHROUGH;
1638	default: / things that break a sequence /
1639	if (newlen > g->mlen) { / ends one /
1640	start = newstart;
1641	g->mlen = newlen;
1642	}
1643	newlen = `0`;
1644	break;
1645	}
1646	} while (OP(s) != OEND);
1647
1648	if (g->mlen == `0`) / there isn't one /
1649	return;
1650
1651	/ turn it into a character string /
1652	g->must = malloc(size: (size_t)g->mlen + `1`);
1653	if (g->must == NULL) { / argh; just forget it /
1654	g->mlen = `0`;
1655	return;
1656	}
1657	cp = g->must;
1658	scan = start;
1659	for (i = g->mlen; i > `0`; i--) {
1660	while (OP(s = *scan++) != OCHAR)
1661	continue;
1662	assert(cp < g->must + g->mlen);
1663	cp++ = (char*)OPND(s);
1664	}
1665	assert(cp == g->must + g->mlen);
1666	cp++ = `'\0'`; /* just on general principles /
1667	}
1668
1669	/*
1670	- pluscount - count + nesting
1671	*/
1672	static sopno / nesting depth /
1673	pluscount(struct parse p, struct* re_guts *g)
1674	{
1675	sop *scan;
1676	sop s;
1677	sopno plusnest = `0`;
1678	sopno maxnest = `0`;
1679
1680	if (p->error != `0`)
1681	return(`0`); / there may not be an OEND /
1682
1683	scan = g->strip + `1`;
1684	do {
1685	s = *scan++;
1686	switch (OP(s)) {
1687	case OPLUS_:
1688	plusnest++;
1689	break;
1690	case O_PLUS:
1691	if (plusnest > maxnest)
1692	maxnest = plusnest;
1693	plusnest--;
1694	break;
1695	}
1696	} while (OP(s) != OEND);
1697	if (plusnest != `0`)
1698	g->iflags \|= REGEX_BAD;
1699	return(maxnest);
1700	}
1701

Browse the source code of llvm_projects/llvm/lib/Support/regcomp.c