1/*
2 * Copyright (c) 2002 by The XFree86 Project, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 *
22 * Except as contained in this notice, the name of the XFree86 Project shall
23 * not be used in advertising or otherwise to promote the sale, use or other
24 * dealings in this Software without prior written authorization from the
25 * XFree86 Project.
26 *
27 * Author: Paulo César Pereira de Andrade
28 */
29
30/* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.8 2002/09/29 02:55:01 paulo Exp $ */
31
32#include "rep.h"
33
34/*
35 *  This file is a placeholder to add code to analyse and optimize the
36 * intermediate data structure generated in rep.c.
37 *  Character ranges are optimized while being generated.
38 */
39
40/*
41 * Types
42 */
43typedef struct _orec_inf {
44    rec_alt *alt;			/* Main alternatives list */
45    rec_grp *grp;			/* Current group pointer */
46    int flags;
47    int ecode;
48} orec_inf;
49
50/*
51 * Prototypes
52 */
53static int orec_alt(orec_inf*, rec_alt*);
54static int orec_pat(orec_inf*, rec_pat*);
55static int orec_grp(orec_inf*, rec_grp*);
56static int orec_pat_bad_rpt(orec_inf*, rec_pat*);
57static int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*);
58static int orec_pat_rng(orec_inf*, rec_pat*);
59static int orec_pat_cse(orec_inf*, rec_pat*);
60static int orec_pat_cse_can(orec_inf*, rec_pat*);
61static int orec_str_list(orec_inf*, rec_alt*, int, int);
62
63/*
64 * Initialization
65 */
66extern unsigned char re__alnum[256];
67extern unsigned char re__odigit[256];
68extern unsigned char re__ddigit[256];
69extern unsigned char re__xdigit[256];
70extern unsigned char re__control[256];
71
72/*
73 * Implementation
74 */
75int
76orec_comp(rec_alt *alt, int flags)
77{
78    orec_inf inf;
79
80    inf.alt = alt;
81    inf.grp = NULL;
82    inf.flags = flags;
83    inf.ecode = 0;
84
85    orec_alt(&inf, alt);
86
87    return (inf.ecode);
88}
89
90void
91orec_free_stl(rec_stl *stl)
92{
93    int i;
94
95    for (i = 0; i < stl->nstrs; i++) {
96	if (stl->lens[i] > 2)
97	    free(stl->strs[i]);
98    }
99
100    free(stl->lens);
101    free(stl->strs);
102    free(stl);
103}
104
105
106static int
107orec_alt(orec_inf *inf, rec_alt *alt)
108{
109    if (alt) {
110	rec_alt *ptr = alt;
111	int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0;
112
113	/* Check if can build a string list */
114	if (ptr->next) {
115	    /* If more than one alternative */
116	    while (ptr && (str || cstr)) {
117		if (ptr->pat == NULL || ptr->pat->rep != NULL) {
118		    cstr = str = 0;
119		    break;
120		}
121		if ((inf->flags & RE_ICASE)) {
122		    if (!(ret = orec_pat_cse_can(inf, ptr->pat))) {
123			cstr = str = 0;
124			break;
125		    }
126		    if (ret == 1)
127			++lits;
128		    else if (ret == 2)
129			++clits;
130		}
131		else if (ptr->pat->next == NULL) {
132		    if (ptr->pat->type != Rep_String) {
133			if (ptr->pat->type != Rep_Literal) {
134			    str = 0;
135			    if (ptr->pat->type != Rep_CaseString) {
136				if (ptr->pat->type != Rep_CaseLiteral)
137				    cstr = 0;
138				else
139				    ++clits;
140			    }
141			    else if (strlen((char*)ptr->pat->data.str) >= 255)
142				str = cstr = 0;
143			}
144			else
145			    ++lits;
146		    }
147		    else if (strlen((char*)ptr->pat->data.str) >= 255)
148			str = cstr = 0;
149		}
150		else {
151		    str = cstr = 0;
152		    break;
153		}
154		if (++count >= 255)
155		    str = cstr = 0;
156		ptr = ptr->next;
157	    }
158
159	    if (str || cstr) {
160		if (inf->flags & RE_ICASE) {
161		    for (ptr = alt; ptr; ptr = ptr->next) {
162			if (orec_pat_cse(inf, ptr->pat))
163			    return (inf->ecode);
164		    }
165		    str = 0;
166		}
167		return (orec_str_list(inf, alt, str, count));
168	    }
169	}
170	else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) {
171	    /* If the toplevel single alternative */
172	    switch (alt->pat->type) {
173		/*  One of these will always be true for RE_NOSPEC,
174		 * but can also be optimized for simple patterns */
175		case Rep_Literal:
176		    alt->pat->type = Rep_SearchLiteral;
177		    break;
178		case Rep_CaseLiteral:
179		    alt->pat->type = Rep_SearchCaseLiteral;
180		    break;
181		case Rep_String:
182		    alt->pat->type = Rep_SearchString;
183		    break;
184		case Rep_CaseString:
185		    alt->pat->type = Rep_SearchCaseString;
186		    break;
187		default:
188		    break;
189	    }
190	}
191
192	while (alt) {
193	    orec_pat(inf, alt->pat);
194	    alt = alt->next;
195	}
196    }
197
198    return (inf->ecode);
199}
200
201static int
202orec_pat(orec_inf *inf, rec_pat *pat)
203{
204    rec_pat *next;
205
206    while (pat) {
207	switch (pat->type) {
208	    case Rep_AnyAnyTimes:
209		if (pat->next == NULL) {
210		    rec_grp *grp = inf->grp;
211
212		    next = NULL;
213		    while (grp) {
214			next = grp->parent->next;
215			/* Cannot check if is .*$ as the input
216			 * may be a substring */
217			if (next)
218			    break;
219			grp = grp->pgrp;
220		    }
221		    if (next == NULL) {
222			/*    <re>.*    */
223			pat->type = Rep_AnyEatAnyTimes;
224			grp = inf->grp;
225			while (grp) {
226			    --grp->comp;
227			    next = grp->parent->next;
228			    if (next)
229				break;
230			    grp = grp->pgrp;
231			}
232		    }
233		    else if (orec_pat_bad_rpt(inf, next))
234			return (inf->ecode);
235		}
236		else if (orec_pat_bad_rpt(inf, pat->next))
237		    return (inf->ecode);
238		break;
239	    case Rep_AnyMaybe:
240		if (pat->next == NULL) {
241		    rec_grp *grp = inf->grp;
242
243		    next = NULL;
244		    while (grp) {
245			next = grp->parent->next;
246			if (next)
247			    break;
248			grp = grp->pgrp;
249		    }
250		    if (next == NULL) {
251			/*    <re>.?    */
252			pat->type = Rep_AnyEatMaybe;
253			grp = inf->grp;
254			while (grp) {
255			    --grp->comp;
256			    next = grp->parent->next;
257			    if (next)
258				break;
259			    grp = grp->pgrp;
260			}
261		    }
262		    else if (orec_pat_bad_rpt(inf, next))
263			return (inf->ecode);
264		}
265		else if (orec_pat_bad_rpt(inf, pat->next))
266		    return (inf->ecode);
267		break;
268	    case Rep_AnyAtLeast:
269		if (pat->next == NULL) {
270		    rec_grp *grp = inf->grp;
271
272		    next = NULL;
273		    while (grp) {
274			next = grp->parent->next;
275			if (next)
276			    break;
277			grp = grp->pgrp;
278		    }
279		    if (next == NULL) {
280			/*    <re>.+    */
281			pat->type = Rep_AnyEatAtLeast;
282			grp = inf->grp;
283			while (grp) {
284			    --grp->comp;
285			    next = grp->parent->next;
286			    if (next)
287				break;
288			    grp = grp->pgrp;
289			}
290		    }
291		    else if (orec_pat_bad_rpt(inf, next))
292			return (inf->ecode);
293		}
294		else if (orec_pat_bad_rpt(inf, pat->next))
295		    return (inf->ecode);
296		break;
297	    case Rep_Range:
298	    case Rep_RangeNot:
299		orec_pat_rng(inf, pat);
300		break;
301	    case Rep_Group:
302		orec_grp(inf, pat->data.grp);
303		break;
304	    default:
305		break;
306	}
307	pat = pat->next;
308    }
309
310    return (inf->ecode);
311}
312
313static int
314orec_pat_bad_rpt(orec_inf *inf, rec_pat *pat)
315{
316    switch (pat->type) {
317	/* Not really an error, but aren't supported by the library.
318	 * Includes:  .*.*, .+<re>?  .*<re>*, (.*)(<re>*), etc.
319	 */
320
321	/* Not a repetition, but mathes anything... */
322	case Rep_Any:
323
324	/* Zero length matches */
325	case Rep_Eol:
326	    if (!(inf->flags & RE_NEWLINE))
327		break;
328	case Rep_Bol:
329	case Rep_Bow:
330	case Rep_Eow:
331
332	/* Repetitions */
333	case Rep_AnyAnyTimes:
334	case Rep_AnyMaybe:
335	case Rep_AnyAtLeast:
336	    inf->ecode = RE_BADRPT;
337	    break;
338
339	/* Check if the first group element is a complex pattern */
340	case Rep_Group:
341	    if (pat->rep == NULL) {
342		if (pat->data.grp->alt) {
343		    for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) {
344			if (orec_pat_bad_rpt(inf, pat))
345			    break;
346		    }
347		}
348		break;
349	    }
350	    /*FALLTHROUGH*/
351	default:
352	    if (pat->rep)
353		inf->ecode = RE_BADRPT;
354	    break;
355    }
356
357    if (!inf->ecode && pat && pat->next)
358	orec_pat_bad_forward_rpt(inf, pat->next);
359
360    return (inf->ecode);
361}
362
363static int
364orec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat)
365{
366    if (pat->rep) {
367	switch (pat->rep->type) {
368	    case Rer_MinMax:
369		if (pat->rep->mine > 0)
370		    break;
371	    case Rer_AnyTimes:
372	    case Rer_Maybe:
373	    case Rer_Max:
374		inf->ecode = RE_BADRPT;
375	    default:
376		break;
377	}
378    }
379    else if (pat->type == Rep_Group &&
380	     pat->data.grp->alt &&
381	     pat->data.grp->alt->pat)
382	orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat);
383
384    return (inf->ecode);
385}
386
387static int
388orec_grp(orec_inf *inf, rec_grp *grp)
389{
390    rec_grp *prev = inf->grp;
391
392    inf->grp = grp;
393    orec_alt(inf, grp->alt);
394    /* Could also just say: inf->grp = grp->gparent */
395    inf->grp = prev;
396
397    return (inf->ecode);
398}
399
400static int
401orec_pat_rng(orec_inf *inf, rec_pat *pat)
402{
403    int i, j[2], count;
404    rec_pat_t type = pat->type;
405    unsigned char *range = pat->data.rng->range;
406
407    for (i = count = j[0] = j[1] = 0; i < 256; i++) {
408	if (range[i]) {
409	    if (count == 2) {
410		++count;
411		break;
412	    }
413	    j[count++] = i;
414	}
415    }
416
417    if (count == 1 ||
418	(count == 2 &&
419	 ((islower(j[0]) && toupper(j[0]) == j[1]) ||
420	  (isupper(j[0]) && tolower(j[0]) == j[1])))) {
421	free(pat->data.rng);
422	if (count == 1) {
423	    pat->data.chr = j[0];
424	    pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot;
425	}
426	else {
427	    pat->data.cse.upper = j[0];
428	    pat->data.cse.lower = j[1];
429	    pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot;
430	}
431    }
432    else {
433	if (memcmp(re__alnum, range, 256) == 0)
434	    type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot;
435	else if (memcmp(re__odigit, range, 256) == 0)
436	    type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot;
437	else if (memcmp(re__ddigit, range, 256) == 0)
438	    type = type == Rep_Range ? Rep_Digit : Rep_DigitNot;
439	else if (memcmp(re__xdigit, range, 256) == 0)
440	    type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot;
441	else if (memcmp(re__control, range, 256) == 0)
442	    type = type == Rep_Range ? Rep_Control : Rep_ControlNot;
443
444	if (type != pat->type) {
445	    free(pat->data.rng);
446	    pat->type = type;
447	}
448    }
449
450    return (inf->ecode);
451}
452
453/*  Join patterns if required, will only fail on memory allocation failure:
454 */
455static int
456orec_pat_cse(orec_inf *inf, rec_pat *pat)
457{
458    rec_pat_t type;
459    int i, len, length;
460    rec_pat *ptr, *next;
461    unsigned char *str, *tofree;
462
463    if (pat->next == NULL && pat->type == Rep_CaseString)
464	return (inf->ecode);
465
466    type = Rep_CaseString;
467
468    /* First calculate how many bytes will be required */
469    for (ptr = pat, length = 1; ptr; ptr = ptr->next) {
470	switch (ptr->type) {
471	    case Rep_Literal:
472		length += 2;
473		break;
474	    case Rep_String:
475		length += strlen((char*)ptr->data.str) << 1;
476		break;
477	    case Rep_CaseLiteral:
478		length += 2;
479		break;
480	    case Rep_CaseString:
481		length += strlen((char*)ptr->data.str);
482		break;
483	    default:
484		break;
485	}
486    }
487
488    if ((str = malloc(length)) == NULL)
489	return (inf->ecode = RE_ESPACE);
490
491    for (ptr = pat, length = 0; ptr; ptr = next) {
492	tofree = NULL;
493	next = ptr->next;
494	switch (ptr->type) {
495	    case Rep_Literal:
496		str[length++] = ptr->data.chr;
497		str[length++] = ptr->data.chr;
498		break;
499	    case Rep_String:
500		tofree = ptr->data.str;
501		len = strlen((char*)tofree);
502		for (i = 0; i < len; i++) {
503		    str[length++] = tofree[i];
504		    str[length++] = tofree[i];
505		}
506		break;
507	    case Rep_CaseLiteral:
508		str[length++] = ptr->data.cse.lower;
509		str[length++] = ptr->data.cse.upper;
510		break;
511	    case Rep_CaseString:
512		tofree = ptr->data.str;
513		len = strlen((char*)tofree);
514		memcpy(str + length, tofree, len);
515		length += len;
516		break;
517	    default:
518		break;
519	}
520	if (tofree)
521	    free(tofree);
522	if (ptr != pat)
523	    free(ptr);
524    }
525    str[length] = '\0';
526
527    pat->type = type;
528    pat->data.str = str;
529    pat->next = NULL;
530
531    return (inf->ecode);
532}
533
534/*  Return 0 if the patterns in the list cannot be merged, 1 if will
535 * be a simple string, 2 if a case string.
536 *  This is useful when building an alternative list that is composed
537 * only of strings, but the regex is case insensitive, in wich case
538 * the first pass may have splited some patterns, but if it is a member
539 * of an alternatives list, the cost of using a string list is smaller */
540static int
541orec_pat_cse_can(orec_inf *inf, rec_pat *pat)
542{
543    int ret;
544
545    if (pat == NULL)
546	return (0);
547
548    for (ret = 1; pat; pat = pat->next) {
549	if (pat->rep)
550	    return (0);
551	switch (pat->type) {
552	    case Rep_Literal:
553	    case Rep_String:
554		break;
555	    case Rep_CaseLiteral:
556	    case Rep_CaseString:
557		ret = 2;
558		break;
559	    default:
560		return (0);
561	}
562    }
563
564    return (ret);
565}
566
567
568/*  XXX If everything is a (case) byte, the pattern should be
569 * [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE)
570 * as a string list works fine, but as a character range
571 * should be faster, and maybe could be converted here. But not
572 * very important, if performance is required, it should have already
573 * been done in the pattern.
574 */
575static int
576orec_str_list(orec_inf *inf, rec_alt *alt, int str, int count)
577{
578    rec_stl *stl;
579    rec_pat *pat;
580    rec_alt *ptr, *next;
581    int i, j, tlen, len, is;
582
583    if ((stl = calloc(1, sizeof(rec_stl))) == NULL)
584	return (inf->ecode = RE_ESPACE);
585
586    if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) {
587	free(stl);
588	return (inf->ecode = RE_ESPACE);
589    }
590
591    if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) {
592	free(stl->lens);
593	free(stl);
594	return (inf->ecode = RE_ESPACE);
595    }
596
597    if ((pat = calloc(1, sizeof(rec_pat))) == NULL) {
598	free(stl->strs);
599	free(stl->lens);
600	free(stl);
601	return (inf->ecode = RE_ESPACE);
602    }
603
604    pat->data.stl = stl;
605    pat->type = Rep_StringList;
606    stl->type = str ? Resl_StringList : Resl_CaseStringList;
607    for (i = tlen = 0, ptr = alt; i < count; i++) {
608	next = ptr->next;
609	switch (ptr->pat->type) {
610	    case Rep_Literal:
611		is = len = 1;
612		break;
613	    case Rep_CaseLiteral:
614		is = len = 2;
615		break;
616	    default:
617		is = 0;
618		len = strlen((char*)ptr->pat->data.str);
619		break;
620	}
621	tlen += len;
622	stl->lens[i] = len;
623	if (!is) {
624	    if (len > 2)
625		stl->strs[i] = ptr->pat->data.str;
626	    else {
627		if (len == 1)
628		    stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]);
629		else
630		    stl->strs[i] = (void*)(long)
631				   (ptr->pat->data.str[0] |
632				    ((int)ptr->pat->data.str[1] << 8));
633		free(ptr->pat->data.str);
634	    }
635	}
636	else {
637	    if (is == 1)
638		stl->strs[i] = (void*)(long)ptr->pat->data.chr;
639	    else
640		stl->strs[i] = (void*)(long)
641			       (ptr->pat->data.cse.lower |
642				(ptr->pat->data.cse.upper << 8));
643	}
644	free(ptr->pat);
645	if (i)
646	    free(ptr);
647	ptr = next;
648    }
649    stl->tlen = tlen;
650    stl->nstrs = count;
651
652    alt->pat = pat;
653    alt->next = NULL;
654
655    {
656	int li, lj;
657	unsigned char ci, cj, *str;
658
659	/*  Don't need a stable sort, there shouldn't be duplicated strings,
660	 * but don't check for it either. Only need to make sure that all
661	 * strings that start with the same byte are together */
662	for (i = 0; i < count; i++) {
663	    li = stl->lens[i];
664	    ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff;
665	    for (j = i + 1; j < count; j++) {
666		lj = stl->lens[j];
667		cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff;
668		if ((count >= LARGE_STL_COUNT && cj < ci) ||
669		    (cj == ci && lj > li)) {
670		    /* If both strings start with the same byte,
671		     * put the longer first */
672		    str = stl->strs[j];
673		    stl->strs[j] = stl->strs[i];
674		    stl->strs[i] = str;
675		    stl->lens[j] = li;
676		    stl->lens[i] = lj;
677		    li ^= lj; lj ^= li; li ^= lj;
678		    ci ^= cj; cj ^= ci; ci ^= cj;
679		}
680	    }
681	}
682    }
683
684    return (inf->ecode);
685}
686