regexJIT.c revision 1.1.1.1.4.2 1 /*
2 * Stack-less Just-In-Time compiler
3 *
4 * Copyright 2009-2010 Zoltan Herczeg (hzmester (at) freemail.hu). All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without modification, are
7 * permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this list of
10 * conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice, this list
13 * of conditions and the following disclaimer in the documentation and/or other materials
14 * provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "sljitLir.h"
28 #include "regexJIT.h"
29
30 #ifdef REGEX_MATCH_VERBOSE
31 #include <stdio.h>
32 #endif
33
34 /* Extra, hidden flags:
35 {id!} where id > 0 found in the code. */
36 #define REGEX_ID_CHECK 0x100
37 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search,
38 which starts with [\r\n] character range. */
39 #define REGEX_FAKE_MATCH_BEGIN 0x200
40 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search,
41 which ends with [\r\n] character range. */
42 #define REGEX_FAKE_MATCH_END 0x400
43
44 /* Check match completition after every (FINISH_TEST + 1) steps. */
45 #define FINISH_TEST 0x7
46
47 /* --------------------------------------------------------------------- */
48 /* Structures for JIT-ed pattern matching */
49 /* --------------------------------------------------------------------- */
50
51 struct regex_machine
52 {
53 /* flags. */
54 int flags;
55 /* Number of state descriptors for one term. */
56 sljit_w no_states;
57 /* Total size. */
58 sljit_w size;
59
60 union {
61 void *init_match;
62 sljit_w (SLJIT_CALL *call_init)(void *next, void* match);
63 } u;
64 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
65 struct sljit_function_context context;
66 #endif
67
68 void *continue_match;
69
70 /* Variable sized array to contain the handler addresses. */
71 sljit_uw entry_addrs[1];
72 };
73
74 struct regex_match
75 {
76 /* Current and next state array. */
77 sljit_w *current;
78 sljit_w *next;
79 /* Starting. */
80 sljit_w head;
81 /* String character index (ever increasing). */
82 sljit_w index;
83 /* Best match found so far (members in priority order). */
84 sljit_w best_begin;
85 sljit_w best_end;
86 sljit_w best_id;
87 /* Bool flags (encoded as word). */
88 sljit_w fast_quit;
89 sljit_w fast_forward;
90 /* Machine. */
91 struct regex_machine *machine;
92
93 union {
94 void *continue_match;
95 void (SLJIT_CALL *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length);
96 } u;
97
98 /* Variable sized array to contain the state arrays. */
99 sljit_w states[1];
100 };
101
102 /* State vector
103 ITEM[0] - pointer to the address inside the machine code
104 ITEM[1] - next pointer
105 ITEM[2] - string started from (optional)
106 ITEM[3] - max ID (optional) */
107
108 /* Register allocation. */
109 /* Current state array (loaded & stored: regex_match->current). */
110 #define R_CURR_STATE SLJIT_SAVED_REG1
111 /* Next state array (loaded & stored: regex_match->next). */
112 #define R_NEXT_STATE SLJIT_SAVED_REG2
113 /* Head (loaded & stored: regex_match->head). */
114 #define R_NEXT_HEAD SLJIT_SAVED_REG3
115 /* String fragment pointer. */
116 #define R_STRING SLJIT_SAVED_EREG1
117 /* String fragment length. */
118 #define R_LENGTH SLJIT_SAVED_EREG2
119 /* 'struct regex_match*' */
120 #define R_REGEX_MATCH SLJIT_TEMPORARY_REG1
121 /* Current character. */
122 #define R_CURR_CHAR SLJIT_TEMPORARY_REG2
123 /* Temporary register. */
124 #define R_TEMP SLJIT_TEMPORARY_REG3
125 /* Caches the regex_match->best_begin. */
126 #define R_BEST_BEGIN SLJIT_TEMPORARY_EREG1
127 /* Current character index. */
128 #define R_CURR_INDEX SLJIT_TEMPORARY_EREG2
129
130 /* --------------------------------------------------------------------- */
131 /* Stack management */
132 /* --------------------------------------------------------------------- */
133
134 /* Try to allocate 2^n blocks. */
135 #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item)))
136
137 struct stack_item {
138 int type;
139 int value;
140 };
141
142 struct stack_fragment_data {
143 struct stack_fragment *next;
144 struct stack_fragment *prev;
145 };
146
147 struct stack_fragment {
148 struct stack_fragment_data data;
149 struct stack_item items[STACK_FRAGMENT_SIZE];
150 };
151
152 struct stack {
153 struct stack_fragment *first;
154 struct stack_fragment *last;
155 int index;
156 int count;
157 };
158
159 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
160
161 static void stack_check(struct stack *stack)
162 {
163 struct stack_fragment *curr;
164 int found;
165
166 if (!stack)
167 return;
168
169 SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE);
170
171 if (stack->first == NULL) {
172 SLJIT_ASSERT(stack->first == NULL && stack->last == NULL);
173 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
174 return;
175 }
176
177 found = 0;
178 if (stack->last == NULL) {
179 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
180 found = 1;
181 }
182 else
183 SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0);
184
185 SLJIT_ASSERT(stack->first->data.prev == NULL);
186 curr = stack->first;
187 while (curr) {
188 if (curr == stack->last)
189 found = 1;
190 if (curr->data.next)
191 SLJIT_ASSERT(curr->data.next->data.prev == curr);
192 curr = curr->data.next;
193 }
194 SLJIT_ASSERT(found);
195 }
196
197 #endif
198
199 static void stack_init(struct stack *stack)
200 {
201 stack->first = NULL;
202 stack->last = NULL;
203 stack->index = STACK_FRAGMENT_SIZE - 1;
204 stack->count = 0;
205 }
206
207 static void stack_destroy(struct stack *stack)
208 {
209 struct stack_fragment *curr = stack->first;
210 struct stack_fragment *prev;
211
212 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
213 stack_check(stack);
214 #endif
215
216 while (curr) {
217 prev = curr;
218 curr = curr->data.next;
219 SLJIT_FREE(prev);
220 }
221 }
222
223 static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack)
224 {
225 SLJIT_ASSERT(stack->last);
226 return stack->last->items + stack->index;
227 }
228
229 static int stack_push(struct stack *stack, int type, int value)
230 {
231 if (stack->last) {
232 stack->index++;
233 if (stack->index >= STACK_FRAGMENT_SIZE) {
234 stack->index = 0;
235 if (!stack->last->data.next) {
236 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment));
237 if (!stack->last->data.next)
238 return 1;
239 stack->last->data.next->data.next = NULL;
240 stack->last->data.next->data.prev = stack->last;
241 }
242 stack->last = stack->last->data.next;
243 }
244 }
245 else if (!stack->first) {
246 stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment));
247 if (!stack->last)
248 return 1;
249 stack->last->data.prev = NULL;
250 stack->last->data.next = NULL;
251 stack->first = stack->last;
252 stack->index = 0;
253 }
254 else {
255 stack->last = stack->first;
256 stack->index = 0;
257 }
258 stack->last->items[stack->index].type = type;
259 stack->last->items[stack->index].value = value;
260 stack->count++;
261 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
262 stack_check(stack);
263 #endif
264 return 0;
265 }
266
267 static struct stack_item* stack_pop(struct stack *stack)
268 {
269 struct stack_item *ret = stack_top(stack);
270
271 if (stack->index > 0)
272 stack->index--;
273 else {
274 stack->last = stack->last->data.prev;
275 stack->index = STACK_FRAGMENT_SIZE - 1;
276 }
277
278 stack->count--;
279 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
280 stack_check(stack);
281 #endif
282 return ret;
283 }
284
285 static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst)
286 {
287 *dst = *src;
288 }
289
290 static int stack_push_copy(struct stack *stack, int items, int length)
291 {
292 struct stack_fragment *frag1;
293 int ind1;
294 struct stack_fragment *frag2;
295 int ind2;
296 int counter;
297
298 SLJIT_ASSERT(stack->count >= length && items <= length && items > 0);
299
300 /* Allocate the necessary elements. */
301 counter = items;
302 frag1 = stack->last;
303 ind1 = stack->index;
304 while (counter > 0) {
305 if (stack->index + counter >= STACK_FRAGMENT_SIZE) {
306 counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1;
307 stack->index = 0;
308 if (!stack->last->data.next) {
309 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment));
310 if (!stack->last->data.next)
311 return 1;
312 stack->last->data.next->data.next = NULL;
313 stack->last->data.next->data.prev = stack->last;
314 }
315 stack->last = stack->last->data.next;
316 }
317 else {
318 stack->index += counter;
319 counter = 0;
320 }
321 }
322
323 frag2 = stack->last;
324 ind2 = stack->index;
325 while (length > 0) {
326 frag2->items[ind2--] = frag1->items[ind1--];
327 if (ind1 < 0) {
328 ind1 = STACK_FRAGMENT_SIZE - 1;
329 frag1 = frag1->data.prev;
330 }
331 if (ind2 < 0) {
332 ind2 = STACK_FRAGMENT_SIZE - 1;
333 frag2 = frag2->data.prev;
334 }
335 length--;
336 }
337
338 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
339 stack_check(stack);
340 #endif
341 stack->count += items;
342 return 0;
343 }
344
345 /* --------------------------------------------------------------------- */
346 /* Parser */
347 /* --------------------------------------------------------------------- */
348
349 enum {
350 /* Common. */
351 type_begin,
352 type_end,
353 type_char,
354 type_newline,
355 type_id,
356 type_rng_start,
357 type_rng_end,
358 type_rng_char,
359 type_rng_left,
360 type_rng_right,
361
362 /* generator only. */
363 type_branch,
364 type_jump,
365
366 /* Parser only. */
367 type_open_br,
368 type_close_br,
369 type_select,
370 type_asterisk,
371 type_plus_sign,
372 type_qestion_mark
373 };
374
375 struct compiler_common {
376 /* Temporary stacks. */
377 struct stack stack;
378 struct stack depth;
379 /* REGEX_ flags. */
380 int flags;
381 /* Encoded size of the dfa representation. */
382 sljit_w dfa_size;
383 /* Number of terms. */
384 sljit_w terms_size;
385 /* Number of state descriptors for one term (same as machine->no_states). */
386 sljit_w no_states;
387 /* Number of type_rng_(char|left)-s in the longest character range. */
388 sljit_w longest_range_size;
389
390 /* DFA linear representation (size: dfa_size). */
391 struct stack_item *dfa_transitions;
392 /* Term id and search state pairs (size: dfa_size). */
393 struct stack_item *search_states;
394
395 /* sljit compiler */
396 struct sljit_compiler *compiler;
397 /* Machine data, which must be kept for later use. */
398 struct regex_machine *machine;
399 /* Temporary space for jumps (size: longest_range_size). */
400 struct sljit_jump **range_jump_list;
401 };
402
403 static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result)
404 {
405 int value = 0;
406
407 SLJIT_ASSERT(length > 0);
408 if (*regex_string < '0' || *regex_string > '9') {
409 *result = -1;
410 return regex_string;
411 }
412
413 while (length > 0 && *regex_string >= '0' && *regex_string <= '9') {
414 value = value * 10 + (*regex_string - '0');
415 length--;
416 regex_string++;
417 }
418
419 *result = value;
420 return regex_string;
421 }
422
423 static int iterate(struct stack *stack, int min, int max)
424 {
425 struct stack it;
426 struct stack_item *item;
427 int count = -1;
428 int len = 0;
429 int depth = 0;
430
431 stack_clone(stack, &it);
432
433 /* Calculate size. */
434 while (count < 0) {
435 item = stack_pop(&it);
436 switch (item->type) {
437 case type_id:
438 case type_rng_end:
439 case type_rng_char:
440 case type_rng_left:
441 case type_rng_right:
442 case type_plus_sign:
443 case type_qestion_mark:
444 len++;
445 break;
446
447 case type_asterisk:
448 len += 2;
449 break;
450
451 case type_close_br:
452 depth++;
453 break;
454
455 case type_open_br:
456 SLJIT_ASSERT(depth > 0);
457 depth--;
458 if (depth == 0)
459 count = it.count;
460 break;
461
462 case type_select:
463 SLJIT_ASSERT(depth > 0);
464 len += 2;
465 break;
466
467 default:
468 SLJIT_ASSERT(item->type != type_begin && item->type != type_end);
469 if (depth == 0)
470 count = it.count;
471 len++;
472 break;
473 }
474 }
475
476 if (min == 0 && max == 0) {
477 /* {0,0} case, not {0,} case: delete subtree. */
478 stack_clone(&it, stack);
479 /* and put an empty bracket expression instead of it. */
480 if (stack_push(stack, type_open_br, 0))
481 return REGEX_MEMORY_ERROR;
482 if (stack_push(stack, type_close_br, 0))
483 return REGEX_MEMORY_ERROR;
484 return len;
485 }
486
487 count = stack->count - count;
488
489 /* Put an open bracket before the sequence. */
490 if (stack_push_copy(stack, 1, count))
491 return -1;
492
493 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
494 SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0);
495 #else
496 stack_push(&it, type_open_br, 0);
497 #endif
498
499 /* Copy the data. */
500 if (max > 0) {
501 len = len * (max - 1);
502 max -= min;
503 /* Insert ? operators. */
504 len += max;
505
506 if (min > 0) {
507 min--;
508 while (min > 0) {
509 if (stack_push_copy(stack, count, count))
510 return -1;
511 min--;
512 }
513 if (max > 0) {
514 if (stack_push_copy(stack, count, count))
515 return -1;
516 if (stack_push(stack, type_qestion_mark, 0))
517 return REGEX_MEMORY_ERROR;
518 count++;
519 max--;
520 }
521 }
522 else {
523 SLJIT_ASSERT(max > 0);
524 max--;
525 count++;
526 if (stack_push(stack, type_qestion_mark, 0))
527 return REGEX_MEMORY_ERROR;
528 }
529
530 while (max > 0) {
531 if (stack_push_copy(stack, count, count))
532 return -1;
533 max--;
534 }
535 }
536 else {
537 SLJIT_ASSERT(min > 0);
538 min--;
539 /* Insert + operator. */
540 len = len * min + 1;
541 while (min > 0) {
542 if (stack_push_copy(stack, count, count))
543 return -1;
544 min--;
545 }
546
547 if (stack_push(stack, type_plus_sign, 0))
548 return REGEX_MEMORY_ERROR;
549 }
550
551 /* Close the opened bracket. */
552 if (stack_push(stack, type_close_br, 0))
553 return REGEX_MEMORY_ERROR;
554
555 return len;
556 }
557
558 static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_w *dfa_size, int begin)
559 {
560 /* We only know that *regex_string == { . */
561 int val1, val2;
562 const regex_char_t *base_from = regex_string;
563 const regex_char_t *from;
564
565 length--;
566 regex_string++;
567
568 /* Decode left value. */
569 val2 = -1;
570 if (length == 0)
571 return -2;
572 if (*regex_string == ',') {
573 val1 = 0;
574 length--;
575 regex_string++;
576 }
577 else {
578 from = regex_string;
579 regex_string = decode_number(regex_string, length, &val1);
580 if (val1 < 0)
581 return -2;
582 length -= regex_string - from;
583
584 if (length == 0)
585 return -2;
586 if (*regex_string == '}') {
587 val2 = val1;
588 if (val1 == 0)
589 val1 = -1;
590 }
591 else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') {
592 /* Non posix extension. */
593 if (stack_push(stack, type_id, val1))
594 return -1;
595 (*dfa_size)++;
596 return (regex_string - base_from) + 1;
597 }
598 else {
599 if (*regex_string != ',')
600 return -2;
601 length--;
602 regex_string++;
603 }
604 }
605
606 if (begin)
607 return -2;
608
609 /* Decode right value. */
610 if (val2 == -1) {
611 if (length == 0)
612 return -2;
613 if (*regex_string == '}')
614 val2 = 0;
615 else {
616 from = regex_string;
617 regex_string = decode_number(regex_string, length, &val2);
618 length -= regex_string - from;
619 if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1)
620 return -2;
621 if (val2 == 0) {
622 SLJIT_ASSERT(val1 == 0);
623 val1 = -1;
624 }
625 }
626 }
627
628 /* Fast cases. */
629 if (val1 > 1 || val2 > 1) {
630 val1 = iterate(stack, val1, val2);
631 if (val1 < 0)
632 return -1;
633 *dfa_size += val1;
634 }
635 else if (val1 == 0 && val2 == 0) {
636 if (stack_push(stack, type_asterisk, 0))
637 return -1;
638 *dfa_size += 2;
639 }
640 else if (val1 == 1 && val2 == 0) {
641 if (stack_push(stack, type_plus_sign, 0))
642 return -1;
643 (*dfa_size)++;
644 }
645 else if (val1 == 0 && val2 == 1) {
646 if (stack_push(stack, type_qestion_mark, 0))
647 return -1;
648 (*dfa_size)++;
649 }
650 else if (val1 == -1) {
651 val1 = iterate(stack, 0, 0);
652 if (val1 < 0)
653 return -1;
654 *dfa_size -= val1;
655 SLJIT_ASSERT(*dfa_size >= 2);
656 }
657 else {
658 /* Ignore. */
659 SLJIT_ASSERT(val1 == 1 && val2 == 1);
660 }
661 return regex_string - base_from;
662 }
663
664 static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
665 {
666 struct stack* stack = &compiler_common->stack;
667 const regex_char_t *base_from = regex_string;
668 regex_char_t left_char, right_char, tmp_char;
669
670 length--;
671 regex_string++;
672
673 if (length == 0)
674 return -2;
675
676 if (*regex_string != '^') {
677 if (stack_push(stack, type_rng_start, 0))
678 return -1;
679 }
680 else {
681 length--;
682 regex_string++;
683
684 if (length == 0)
685 return -2;
686
687 if (stack_push(stack, type_rng_start, 1))
688 return -1;
689 }
690 /* For both the type_rng_start & type_rng_end. */
691 compiler_common->dfa_size += 2;
692
693 /* Range must be at least 1 character. */
694 if (*regex_string == ']') {
695 length--;
696 regex_string++;
697 if (stack_push(stack, type_rng_char, ']'))
698 return -1;
699 compiler_common->dfa_size++;
700 }
701
702 while (1) {
703 if (length == 0)
704 return -2;
705
706 if (*regex_string == ']')
707 break;
708
709 if (*regex_string != '\\')
710 left_char = *regex_string;
711 else {
712 regex_string++;
713 length--;
714 if (length == 0)
715 return -2;
716 left_char = *regex_string;
717 }
718 regex_string++;
719 length--;
720
721 /* Is a range here? */
722 if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') {
723 regex_string++;
724 length--;
725
726 if (*regex_string != '\\')
727 right_char = *regex_string;
728 else {
729 regex_string++;
730 length--;
731 if (length == 0)
732 return -2;
733 right_char = *regex_string;
734 }
735 regex_string++;
736 length--;
737
738 if (left_char > right_char) {
739 /* Swap if necessary. */
740 tmp_char = left_char;
741 left_char = right_char;
742 right_char = tmp_char;
743 }
744
745 if (stack_push(stack, type_rng_left, left_char))
746 return -1;
747 if (stack_push(stack, type_rng_right, right_char))
748 return -1;
749 compiler_common->dfa_size += 2;
750 }
751 else {
752 if (stack_push(stack, type_rng_char, left_char))
753 return -1;
754 compiler_common->dfa_size++;
755 }
756 }
757
758 if (stack_push(stack, type_rng_end, 0))
759 return -1;
760 return regex_string - base_from;
761 }
762
763 static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
764 {
765 /* Depth of bracketed expressions. */
766 int depth = 0;
767 /* Have we already found a term? '1' if not yet. */
768 int begin = 1;
769 /* Cache stack pointer. */
770 struct stack* stack = &compiler_common->stack;
771 int tmp;
772
773 /* Type_begin and type_end. */
774 compiler_common->dfa_size = 2;
775 stack_init(stack);
776 if (stack_push(stack, type_begin, 0))
777 return REGEX_MEMORY_ERROR;
778
779 if (length > 0 && *regex_string == '^') {
780 compiler_common->flags |= REGEX_MATCH_BEGIN;
781 length--;
782 regex_string++;
783 }
784
785 if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) {
786 /* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */
787 compiler_common->flags &= ~REGEX_MATCH_BEGIN;
788 compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN;
789 /* and append a new-line search. */
790 if (stack_push(stack, type_newline, 0))
791 return REGEX_MEMORY_ERROR;
792 compiler_common->dfa_size++;
793 /* Begin intentionally kept as 1. */
794 }
795
796 while (length > 0) {
797 switch (*regex_string) {
798 case '\\' :
799 length--;
800 regex_string++;
801 if (length == 0)
802 return REGEX_INVALID_REGEX;
803 if (stack_push(stack, type_char, *regex_string))
804 return REGEX_MEMORY_ERROR;
805 begin = 0;
806 compiler_common->dfa_size++;
807 break;
808
809 case '.' :
810 if (stack_push(stack, type_rng_start, 1))
811 return REGEX_MEMORY_ERROR;
812 if (compiler_common->flags & REGEX_NEWLINE) {
813 if (stack_push(stack, type_rng_char, '\n'))
814 return REGEX_MEMORY_ERROR;
815 if (stack_push(stack, type_rng_char, '\r'))
816 return REGEX_MEMORY_ERROR;
817 compiler_common->dfa_size += 2;
818 }
819 if (stack_push(stack, type_rng_end, 1))
820 return REGEX_MEMORY_ERROR;
821 begin = 0;
822 compiler_common->dfa_size += 2;
823 break;
824
825 case '(' :
826 depth++;
827 if (stack_push(stack, type_open_br, 0))
828 return REGEX_MEMORY_ERROR;
829 begin = 1;
830 break;
831
832 case ')' :
833 if (depth == 0)
834 return REGEX_INVALID_REGEX;
835 depth--;
836 if (stack_push(stack, type_close_br, 0))
837 return REGEX_MEMORY_ERROR;
838 begin = 0;
839 break;
840
841 case '|' :
842 if (stack_push(stack, type_select, 0))
843 return REGEX_MEMORY_ERROR;
844 begin = 1;
845 compiler_common->dfa_size += 2;
846 break;
847
848 case '*' :
849 if (begin)
850 return REGEX_INVALID_REGEX;
851 if (stack_push(stack, type_asterisk, 0))
852 return REGEX_MEMORY_ERROR;
853 compiler_common->dfa_size += 2;
854 break;
855
856 case '?' :
857 case '+' :
858 if (begin)
859 return REGEX_INVALID_REGEX;
860 if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0))
861 return REGEX_MEMORY_ERROR;
862 compiler_common->dfa_size++;
863 break;
864
865 case '{' :
866 tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin);
867
868 if (tmp >= 0) {
869 length -= tmp;
870 regex_string += tmp;
871 }
872 else if (tmp == -1)
873 return REGEX_MEMORY_ERROR;
874 else {
875 /* Not a valid range expression. */
876 SLJIT_ASSERT(tmp == -2);
877 if (stack_push(stack, type_char, '{'))
878 return REGEX_MEMORY_ERROR;
879 compiler_common->dfa_size++;
880 }
881 break;
882
883 case '[' :
884 tmp = parse_char_range(regex_string, length, compiler_common);
885 if (tmp >= 0) {
886 length -= tmp;
887 regex_string += tmp;
888 }
889 else if (tmp == -1)
890 return REGEX_MEMORY_ERROR;
891 else {
892 SLJIT_ASSERT(tmp == -2);
893 return REGEX_INVALID_REGEX;
894 }
895 begin = 0;
896 break;
897
898 default:
899 if (length == 1 && *regex_string == '$') {
900 compiler_common->flags |= REGEX_MATCH_END;
901 break;
902 }
903 if (stack_push(stack, type_char, *regex_string))
904 return REGEX_MEMORY_ERROR;
905 begin = 0;
906 compiler_common->dfa_size++;
907 break;
908 }
909 length--;
910 regex_string++;
911 }
912
913 if (depth != 0)
914 return REGEX_INVALID_REGEX;
915
916 if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) {
917 /* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */
918 compiler_common->flags &= ~REGEX_MATCH_END;
919 compiler_common->flags |= REGEX_FAKE_MATCH_END;
920 /* and append a new-line search. */
921 if (stack_push(stack, type_newline, 1))
922 return REGEX_MEMORY_ERROR;
923 compiler_common->dfa_size++;
924 /* Begin intentionally kept as 1. */
925 }
926
927 if (stack_push(stack, type_end, 0))
928 return REGEX_MEMORY_ERROR;
929
930 return REGEX_NO_ERROR;
931 }
932
933 /* --------------------------------------------------------------------- */
934 /* Generating machine state transitions */
935 /* --------------------------------------------------------------------- */
936
937 #define PUT_TRANSITION(typ, val) \
938 do { \
939 --transitions_ptr; \
940 transitions_ptr->type = typ; \
941 transitions_ptr->value = val; \
942 } while (0)
943
944 static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth)
945 {
946 struct stack_item *item;
947
948 while (1) {
949 item = stack_top(depth);
950
951 switch (item->type) {
952 case type_asterisk:
953 SLJIT_ASSERT(transitions[item->value].type == type_branch);
954 transitions[item->value].value = transitions_ptr - transitions;
955 PUT_TRANSITION(type_branch, item->value + 1);
956 break;
957
958 case type_plus_sign:
959 SLJIT_ASSERT(transitions[item->value].type == type_branch);
960 transitions[item->value].value = transitions_ptr - transitions;
961 break;
962
963 case type_qestion_mark:
964 PUT_TRANSITION(type_branch, item->value);
965 break;
966
967 default:
968 return transitions_ptr;
969 }
970 stack_pop(depth);
971 }
972 }
973
974 static int generate_transitions(struct compiler_common *compiler_common)
975 {
976 struct stack *stack = &compiler_common->stack;
977 struct stack *depth = &compiler_common->depth;
978 struct stack_item *transitions_ptr;
979 struct stack_item *item;
980
981 stack_init(depth);
982 compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size);
983 if (!compiler_common->dfa_transitions)
984 return REGEX_MEMORY_ERROR;
985
986 /* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */
987 transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size;
988 while (stack->count > 0) {
989 item = stack_pop(stack);
990 switch (item->type) {
991 case type_begin:
992 case type_open_br:
993 item = stack_pop(depth);
994 if (item->type == type_select)
995 PUT_TRANSITION(type_branch, item->value + 1);
996 else
997 SLJIT_ASSERT(item->type == type_close_br);
998 if (stack->count == 0)
999 PUT_TRANSITION(type_begin, 0);
1000 else
1001 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1002 break;
1003
1004 case type_end:
1005 case type_close_br:
1006 if (item->type == type_end)
1007 *--transitions_ptr = *item;
1008 if (stack_push(depth, type_close_br, transitions_ptr - compiler_common->dfa_transitions))
1009 return REGEX_MEMORY_ERROR;
1010 break;
1011
1012 case type_select:
1013 item = stack_top(depth);
1014 if (item->type == type_select) {
1015 SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump);
1016 PUT_TRANSITION(type_branch, item->value + 1);
1017 PUT_TRANSITION(type_jump, item->value);
1018 item->value = transitions_ptr - compiler_common->dfa_transitions;
1019 }
1020 else {
1021 SLJIT_ASSERT(item->type == type_close_br);
1022 item->type = type_select;
1023 PUT_TRANSITION(type_jump, item->value);
1024 item->value = transitions_ptr - compiler_common->dfa_transitions;
1025 }
1026 break;
1027
1028 case type_asterisk:
1029 case type_plus_sign:
1030 case type_qestion_mark:
1031 if (item->type != type_qestion_mark)
1032 PUT_TRANSITION(type_branch, 0);
1033 if (stack_push(depth, item->type, transitions_ptr - compiler_common->dfa_transitions))
1034 return REGEX_MEMORY_ERROR;
1035 break;
1036
1037 case type_char:
1038 case type_newline:
1039 case type_rng_start:
1040 /* Requires handle_iteratives. */
1041 *--transitions_ptr = *item;
1042 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1043 break;
1044
1045 default:
1046 *--transitions_ptr = *item;
1047 break;
1048 }
1049 }
1050
1051 SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr);
1052 SLJIT_ASSERT(depth->count == 0);
1053 return REGEX_NO_ERROR;
1054 }
1055
1056 #undef PUT_TRANSITION
1057
1058 #ifdef REGEX_MATCH_VERBOSE
1059
1060 static void verbose_transitions(struct compiler_common *compiler_common)
1061 {
1062 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1063 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1064 struct stack_item *search_states_ptr = compiler_common->search_states;
1065 int pos;
1066
1067 printf("-----------------\nTransitions\n-----------------\n");
1068 pos = 0;
1069 while (transitions_ptr < transitions_end) {
1070 printf("[%3d] ", pos++);
1071 if (search_states_ptr->type >= 0)
1072 printf("(%3d) ", search_states_ptr->type);
1073 switch (transitions_ptr->type) {
1074 case type_begin:
1075 printf("type_begin\n");
1076 break;
1077
1078 case type_end:
1079 printf("type_end\n");
1080 break;
1081
1082 case type_char:
1083 if (transitions_ptr->value >= ' ')
1084 printf("type_char '%c'\n", transitions_ptr->value);
1085 else
1086 printf("type_char 0x%x\n", transitions_ptr->value);
1087 break;
1088
1089 case type_newline:
1090 printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)");
1091 break;
1092
1093 case type_id:
1094 printf("type_id %d\n", transitions_ptr->value);
1095 break;
1096
1097 case type_rng_start:
1098 printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)");
1099 break;
1100
1101 case type_rng_end:
1102 printf("type_rng_end\n");
1103 break;
1104
1105 case type_rng_char:
1106 if (transitions_ptr->value >= ' ')
1107 printf("type_rng_char '%c'\n", transitions_ptr->value);
1108 else
1109 printf("type_rng_char 0x%x\n", transitions_ptr->value);
1110 break;
1111
1112 case type_rng_left:
1113 if (transitions_ptr->value >= ' ')
1114 printf("type_rng_left '%c'\n", transitions_ptr->value);
1115 else
1116 printf("type_rng_left 0x%x\n", transitions_ptr->value);
1117 break;
1118
1119 case type_rng_right:
1120 if (transitions_ptr->value >= ' ')
1121 printf("type_rng_right '%c'\n", transitions_ptr->value);
1122 else
1123 printf("type_rng_right 0x%x\n", transitions_ptr->value);
1124 break;
1125
1126 case type_branch:
1127 printf("type_branch -> %d\n", transitions_ptr->value);
1128 break;
1129
1130 case type_jump:
1131 printf("type_jump -> %d\n", transitions_ptr->value);
1132 break;
1133
1134 default:
1135 printf("UNEXPECTED TYPE\n");
1136 break;
1137 }
1138 transitions_ptr++;
1139 search_states_ptr++;
1140 }
1141 printf("flags: ");
1142 if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)))
1143 printf("none ");
1144 if (compiler_common->flags & REGEX_MATCH_BEGIN)
1145 printf("REGEX_MATCH_BEGIN ");
1146 if (compiler_common->flags & REGEX_MATCH_END)
1147 printf("REGEX_MATCH_END ");
1148 if (compiler_common->flags & REGEX_NEWLINE)
1149 printf("REGEX_NEWLINE ");
1150 if (compiler_common->flags & REGEX_ID_CHECK)
1151 printf("REGEX_ID_CHECK ");
1152 if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)
1153 printf("REGEX_FAKE_MATCH_BEGIN ");
1154 if (compiler_common->flags & REGEX_FAKE_MATCH_END)
1155 printf("REGEX_FAKE_MATCH_END ");
1156 if (compiler_common->longest_range_size > 0)
1157 printf("(longest range: %ld) ", (long)compiler_common->longest_range_size);
1158 printf("\n");
1159 }
1160
1161 #endif
1162
1163 /* --------------------------------------------------------------------- */
1164 /* Utilities */
1165 /* --------------------------------------------------------------------- */
1166
1167 static int generate_search_states(struct compiler_common *compiler_common)
1168 {
1169 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1170 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1171 struct stack_item *search_states_ptr;
1172 struct stack_item *rng_start = NULL;
1173
1174 compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2;
1175 compiler_common->longest_range_size = 0;
1176 compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size);
1177 if (!compiler_common->search_states)
1178 return REGEX_MEMORY_ERROR;
1179
1180 search_states_ptr = compiler_common->search_states;
1181 while (transitions_ptr < transitions_end) {
1182 switch (transitions_ptr->type) {
1183 case type_begin:
1184 case type_end:
1185 search_states_ptr->type = 0;
1186 break;
1187
1188 case type_char:
1189 search_states_ptr->type = compiler_common->terms_size++;
1190 break;
1191
1192 case type_newline:
1193 if (transitions_ptr->value)
1194 search_states_ptr->type = 1;
1195 else
1196 search_states_ptr->type = compiler_common->terms_size++;
1197 SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2);
1198 break;
1199
1200 case type_id:
1201 if (transitions_ptr->value > 0)
1202 compiler_common->flags |= REGEX_ID_CHECK;
1203 search_states_ptr->type = -1;
1204 break;
1205
1206 case type_rng_start:
1207 search_states_ptr->type = compiler_common->terms_size;
1208 rng_start = search_states_ptr;
1209 break;
1210
1211 case type_rng_end:
1212 search_states_ptr->type = compiler_common->terms_size++;
1213 /* Ok, this is a blunt over estimation :) */
1214 if (compiler_common->longest_range_size < search_states_ptr - rng_start)
1215 compiler_common->longest_range_size = search_states_ptr - rng_start;
1216 break;
1217
1218 default:
1219 search_states_ptr->type = -1;
1220 break;
1221 }
1222 search_states_ptr->value = -1;
1223 search_states_ptr++;
1224 transitions_ptr++;
1225 }
1226 return REGEX_NO_ERROR;
1227 }
1228
1229 static int trace_transitions(int from, struct compiler_common *compiler_common)
1230 {
1231 int id = 0;
1232 struct stack *stack = &compiler_common->stack;
1233 struct stack *depth = &compiler_common->depth;
1234 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1235 struct stack_item *search_states = compiler_common->search_states;
1236
1237 SLJIT_ASSERT(search_states[from].type >= 0);
1238
1239 from++;
1240
1241 /* Be prepared for any paths (loops, etc). */
1242 while (1) {
1243 if (dfa_transitions[from].type == type_id)
1244 if (id < dfa_transitions[from].value)
1245 id = dfa_transitions[from].value;
1246
1247 if (search_states[from].value < id) {
1248 /* Forward step. */
1249 if (search_states[from].value == -1)
1250 if (stack_push(stack, 0, from))
1251 return REGEX_MEMORY_ERROR;
1252 search_states[from].value = id;
1253
1254 if (dfa_transitions[from].type == type_branch) {
1255 if (stack_push(depth, id, from))
1256 return REGEX_MEMORY_ERROR;
1257 from++;
1258 continue;
1259 }
1260 else if (dfa_transitions[from].type == type_jump) {
1261 from = dfa_transitions[from].value;
1262 continue;
1263 }
1264 else if (search_states[from].type < 0) {
1265 from++;
1266 continue;
1267 }
1268 }
1269
1270 /* Back tracking. */
1271 if (depth->count > 0) {
1272 id = stack_top(depth)->type;
1273 from = dfa_transitions[stack_pop(depth)->value].value;
1274 continue;
1275 }
1276 return 0;
1277 }
1278 }
1279
1280 /* --------------------------------------------------------------------- */
1281 /* Code generator */
1282 /* --------------------------------------------------------------------- */
1283
1284 #define TERM_OFFSET_OF(index, offs) (((index) * no_states + (offs)) * sizeof(sljit_w))
1285 #define TERM_REL_OFFSET_OF(base, offs) ((base) + ((offs) * sizeof(sljit_w)))
1286
1287 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1288 CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4))
1289
1290 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1291 CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1292
1293 #define EMIT_LABEL(label) \
1294 label = sljit_emit_label(compiler); \
1295 CHECK(!label)
1296
1297 #define EMIT_JUMP(jump, type) \
1298 jump = sljit_emit_jump(compiler, type); \
1299 CHECK(!jump)
1300
1301 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1302 jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \
1303 CHECK(!jump)
1304
1305 /* CHECK depends on the use case. */
1306
1307 #define CHECK(exp) \
1308 if (SLJIT_UNLIKELY(exp)) \
1309 return REGEX_MEMORY_ERROR
1310
1311 static int compile_uncond_tran(struct compiler_common *compiler_common, int reg)
1312 {
1313 struct sljit_compiler *compiler = compiler_common->compiler;
1314 struct stack *stack = &compiler_common->stack;
1315 struct stack_item *search_states = compiler_common->search_states;
1316 int flags = compiler_common->flags;
1317 sljit_w no_states = compiler_common->no_states;
1318 sljit_uw head = 0;
1319 sljit_w offset, value;
1320
1321 if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) {
1322 CHECK(trace_transitions(0, compiler_common));
1323 }
1324 else {
1325 CHECK(trace_transitions(1, compiler_common));
1326 }
1327
1328 while (stack->count > 0) {
1329 value = stack_pop(stack)->value;
1330 if (search_states[value].type >= 0) {
1331 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1332 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1333 if (offset > 0)
1334 head = offset;
1335
1336 if (!(flags & REGEX_MATCH_BEGIN)) {
1337 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0);
1338 if (flags & REGEX_ID_CHECK) {
1339 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value);
1340 }
1341 }
1342 else if (flags & REGEX_ID_CHECK) {
1343 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value);
1344 }
1345 }
1346 search_states[value].value = -1;
1347 }
1348 if (reg == R_NEXT_STATE) {
1349 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1350 }
1351 else if (flags & REGEX_FAKE_MATCH_BEGIN) {
1352 SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value);
1353 offset = TERM_OFFSET_OF(search_states[1].type, 0);
1354
1355 SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN));
1356
1357 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1358 head = offset;
1359
1360 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1);
1361 if (flags & REGEX_ID_CHECK) {
1362 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0);
1363 }
1364 }
1365 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head);
1366 return REGEX_NO_ERROR;
1367 }
1368
1369 static int compile_cond_tran(struct compiler_common *compiler_common, sljit_w curr_index)
1370 {
1371 struct sljit_compiler *compiler = compiler_common->compiler;
1372 struct stack *stack = &compiler_common->stack;
1373 struct stack_item *search_states = compiler_common->search_states;
1374 sljit_w offset;
1375 int flags;
1376 sljit_w no_states;
1377 sljit_w value;
1378 struct sljit_jump *jump1;
1379 struct sljit_jump *jump2;
1380 struct sljit_jump *jump3;
1381 struct sljit_jump *jump4;
1382 struct sljit_jump *jump5;
1383 struct sljit_label *label1;
1384
1385 flags = compiler_common->flags;
1386 no_states = compiler_common->no_states;
1387
1388 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
1389 if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) {
1390 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1391 }
1392
1393 while (stack->count > 0) {
1394 value = stack_pop(stack)->value;
1395 if (search_states[value].type >= 0) {
1396 #ifdef REGEX_MATCH_VERBOSE
1397 if (flags & REGEX_MATCH_VERBOSE)
1398 printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value);
1399 #endif
1400 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1401
1402 if (!(flags & REGEX_ID_CHECK)) {
1403 if (!(flags & REGEX_MATCH_BEGIN)) {
1404 /* Check whether item is inserted. */
1405 EMIT_CMP(jump1, SLJIT_C_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_w), SLJIT_IMM, -1);
1406 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_w), R_NEXT_HEAD, 0);
1407 if (offset > 0) {
1408 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1409 }
1410 EMIT_JUMP(jump2, SLJIT_JUMP);
1411
1412 /* Check whether old index <= index. */
1413 EMIT_LABEL(label1);
1414 sljit_set_label(jump1, label1);
1415
1416 EMIT_CMP(jump1, SLJIT_C_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_w), R_TEMP, 0);
1417
1418 EMIT_LABEL(label1);
1419 sljit_set_label(jump2, label1);
1420 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_w), R_TEMP, 0);
1421
1422 EMIT_LABEL(label1);
1423 sljit_set_label(jump1, label1);
1424 }
1425 else {
1426 /* Check whether item is inserted. */
1427 EMIT_CMP(jump1, SLJIT_C_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_w), SLJIT_IMM, -1);
1428 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_w), R_NEXT_HEAD, 0);
1429 if (offset > 0) {
1430 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1431 }
1432 EMIT_LABEL(label1);
1433 sljit_set_label(jump1, label1);
1434 }
1435 }
1436 else {
1437 if (!(flags & REGEX_MATCH_BEGIN)) {
1438 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1439
1440 /* Check whether item is inserted. */
1441 EMIT_CMP(jump1, SLJIT_C_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_w), SLJIT_IMM, -1);
1442 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_w), R_NEXT_HEAD, 0);
1443 if (offset > 0) {
1444 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1445 }
1446 EMIT_JUMP(jump2, SLJIT_JUMP);
1447
1448 /* Check whether old index != index. */
1449 EMIT_LABEL(label1);
1450 sljit_set_label(jump1, label1);
1451
1452 EMIT_OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_w), R_TEMP, 0);
1453 EMIT_JUMP(jump1, SLJIT_C_LESS);
1454 EMIT_JUMP(jump3, SLJIT_C_GREATER);
1455
1456 /* Old index == index. */
1457 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1458 if (search_states[value].value > 0) {
1459 EMIT_CMP(jump4, SLJIT_C_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1460
1461 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1462 EMIT_LABEL(label1);
1463 sljit_set_label(jump4, label1);
1464 }
1465
1466 EMIT_OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_w), R_TEMP, 0);
1467 EMIT_JUMP(jump4, SLJIT_C_GREATER_EQUAL);
1468 EMIT_JUMP(jump5, SLJIT_JUMP);
1469
1470 /* Overwrite index & id. */
1471 EMIT_LABEL(label1);
1472 sljit_set_label(jump3, label1);
1473 sljit_set_label(jump2, label1);
1474 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_w), R_TEMP, 0);
1475
1476 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1477 if (search_states[value].value > 0) {
1478 EMIT_CMP(jump3, SLJIT_C_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1479
1480 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1481 EMIT_LABEL(label1);
1482 sljit_set_label(jump3, label1);
1483 }
1484
1485 EMIT_LABEL(label1);
1486 sljit_set_label(jump5, label1);
1487 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_w), R_TEMP, 0);
1488
1489 /* Exit. */
1490 EMIT_LABEL(label1);
1491 sljit_set_label(jump1, label1);
1492 sljit_set_label(jump4, label1);
1493 }
1494 else {
1495 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1496
1497 if (search_states[value].value > 0) {
1498 EMIT_CMP(jump1, SLJIT_C_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1499
1500 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1501 EMIT_LABEL(label1);
1502 sljit_set_label(jump1, label1);
1503 }
1504
1505 /* Check whether item is inserted. */
1506 EMIT_CMP(jump1, SLJIT_C_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_w), SLJIT_IMM, -1);
1507 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_w), R_NEXT_HEAD, 0);
1508 if (offset > 0) {
1509 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1510 }
1511 EMIT_JUMP(jump2, SLJIT_JUMP);
1512
1513 /* Check whether old id >= id. */
1514 EMIT_LABEL(label1);
1515 sljit_set_label(jump1, label1);
1516
1517 EMIT_CMP(jump1, SLJIT_C_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_w), R_TEMP, 0);
1518
1519 EMIT_LABEL(label1);
1520 sljit_set_label(jump2, label1);
1521 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_w), R_TEMP, 0);
1522
1523 EMIT_LABEL(label1);
1524 sljit_set_label(jump1, label1);
1525 }
1526 }
1527 }
1528 search_states[value].value = -1;
1529 }
1530
1531 #ifdef REGEX_MATCH_VERBOSE
1532 if (flags & REGEX_MATCH_VERBOSE)
1533 printf("\n");
1534 #endif
1535 return REGEX_NO_ERROR;
1536 }
1537
1538 static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label)
1539 {
1540 struct sljit_compiler *compiler = compiler_common->compiler;
1541 struct sljit_jump *jump;
1542 struct sljit_jump *clear_states_jump;
1543 struct sljit_label *label;
1544 struct sljit_label *leave_label;
1545 struct sljit_label *begin_loop_label;
1546
1547 /* Priority order: best_begin > best_end > best_id.
1548 In other words:
1549 if (new best_begin > old test_begin) do nothing
1550 otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing
1551 therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */
1552
1553 /* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */
1554
1555 if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) {
1556 EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1557 EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_C_LESS : SLJIT_C_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1558 sljit_set_label(jump, end_check_label);
1559
1560 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1561 if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) {
1562 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1563 }
1564 else {
1565 if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) {
1566 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2);
1567 }
1568 else {
1569 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1);
1570 }
1571 }
1572 if (compiler_common->flags & REGEX_ID_CHECK) {
1573 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3));
1574 }
1575
1576 EMIT_CMP(clear_states_jump, SLJIT_C_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0);
1577
1578 EMIT_LABEL(leave_label);
1579 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0);
1580 EMIT_JUMP(jump, SLJIT_JUMP);
1581 sljit_set_label(jump, end_check_label);
1582
1583 /* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */
1584 EMIT_LABEL(label);
1585 sljit_set_label(clear_states_jump, label);
1586
1587 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1588 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
1589
1590 /* Begin of the loop. */
1591 EMIT_LABEL(begin_loop_label);
1592 EMIT_CMP(jump, SLJIT_C_EQUAL, R_TEMP, 0, SLJIT_IMM, 0);
1593 sljit_set_label(jump, leave_label);
1594
1595 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0);
1596 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_w));
1597 EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_C_GREATER : SLJIT_C_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_w), R_CURR_CHAR, 0);
1598
1599 /* Case 1: keep this case. */
1600 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_w), R_NEXT_HEAD, 0);
1601 EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0);
1602
1603 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1604 EMIT_JUMP(jump, SLJIT_JUMP);
1605 sljit_set_label(jump, begin_loop_label);
1606
1607 /* Case 2: remove this case. */
1608 EMIT_LABEL(label);
1609 sljit_set_label(clear_states_jump, label);
1610
1611 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_w), SLJIT_IMM, -1);
1612
1613 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1614 EMIT_JUMP(jump, SLJIT_JUMP);
1615 sljit_set_label(jump, begin_loop_label);
1616 }
1617 else {
1618 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0);
1619 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
1620 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1621 if (compiler_common->flags & REGEX_ID_CHECK) {
1622 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1623 }
1624 EMIT_JUMP(jump, SLJIT_JUMP);
1625 sljit_set_label(jump, end_check_label);
1626 }
1627 return REGEX_NO_ERROR;
1628 }
1629
1630 static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label)
1631 {
1632 struct sljit_compiler *compiler = compiler_common->compiler;
1633 struct stack *stack = &compiler_common->stack;
1634 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1635 struct stack_item *search_states = compiler_common->search_states;
1636 int ind;
1637 struct sljit_jump *jump;
1638 int init_range = 1, prev_value = 0;
1639
1640 while (stack->count > 0) {
1641 ind = stack_pop(stack)->value;
1642 search_states[ind].value = -1;
1643 if (search_states[ind].type >= 0) {
1644 if (dfa_transitions[ind].type == type_char) {
1645 EMIT_CMP(jump, SLJIT_C_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1646 sljit_set_label(jump, fast_forward_label);
1647 }
1648 else if (dfa_transitions[ind].type == type_rng_start) {
1649 SLJIT_ASSERT(!dfa_transitions[ind].value);
1650 ind++;
1651 while (dfa_transitions[ind].type != type_rng_end) {
1652 if (dfa_transitions[ind].type == type_rng_char) {
1653 EMIT_CMP(jump, SLJIT_C_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1654 sljit_set_label(jump, fast_forward_label);
1655 }
1656 else {
1657 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1658 if (init_range) {
1659 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1660 init_range = 0;
1661 }
1662 if (dfa_transitions[ind].value != prev_value) {
1663 /* Best compatibility to all archs. */
1664 prev_value -= dfa_transitions[ind].value;
1665 if (prev_value < 0) {
1666 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1667 }
1668 else {
1669 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1670 }
1671 prev_value = dfa_transitions[ind].value;
1672 }
1673 EMIT_CMP(jump, SLJIT_C_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1674 sljit_set_label(jump, fast_forward_label);
1675 ind++;
1676 }
1677 ind++;
1678 }
1679 }
1680 else {
1681 SLJIT_ASSERT(dfa_transitions[ind].type == type_newline);
1682 EMIT_CMP(jump, SLJIT_C_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1683 sljit_set_label(jump, fast_forward_label);
1684 EMIT_CMP(jump, SLJIT_C_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1685 sljit_set_label(jump, fast_forward_label);
1686 }
1687 }
1688 }
1689 return REGEX_NO_ERROR;
1690 }
1691
1692 static int compile_newline_check(struct compiler_common *compiler_common, sljit_w ind)
1693 {
1694 struct sljit_compiler *compiler = compiler_common->compiler;
1695 struct sljit_jump *jump1;
1696 struct sljit_jump *jump2;
1697 struct sljit_label *label;
1698 sljit_w no_states;
1699 sljit_w offset;
1700
1701 /* Check whether a new-line character is found. */
1702 EMIT_CMP(jump1, SLJIT_C_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1703 EMIT_CMP(jump2, SLJIT_C_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1704
1705 no_states = compiler_common->no_states;
1706 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1707 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1708 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1709 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1710
1711 EMIT_LABEL(label);
1712 sljit_set_label(jump1, label);
1713 sljit_set_label(jump2, label);
1714 return REGEX_NO_ERROR;
1715 }
1716
1717 #undef CHECK
1718
1719 #define CHECK(exp) \
1720 if (SLJIT_UNLIKELY(exp)) \
1721 return 0
1722
1723 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label)
1724 {
1725 while (*range_jump_list) {
1726 sljit_set_label(*range_jump_list, label);
1727 range_jump_list++;
1728 }
1729 }
1730
1731 static sljit_w compile_range_check(struct compiler_common *compiler_common, sljit_w ind)
1732 {
1733 struct sljit_compiler *compiler = compiler_common->compiler;
1734 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1735 struct sljit_jump **range_jump_list = compiler_common->range_jump_list;
1736 int invert = dfa_transitions[ind].value;
1737 struct sljit_label *label;
1738 sljit_w no_states;
1739 sljit_w offset;
1740 int init_range = 1, prev_value = 0;
1741
1742 ind++;
1743
1744 while (dfa_transitions[ind].type != type_rng_end) {
1745 if (dfa_transitions[ind].type == type_rng_char) {
1746 EMIT_CMP(*range_jump_list, SLJIT_C_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1747 range_jump_list++;
1748 }
1749 else {
1750 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1751 if (init_range) {
1752 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1753 init_range = 0;
1754 }
1755 if (dfa_transitions[ind].value != prev_value) {
1756 /* Best compatibility to all archs. */
1757 prev_value -= dfa_transitions[ind].value;
1758 if (prev_value < 0) {
1759 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1760 }
1761 else {
1762 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1763 }
1764 prev_value = dfa_transitions[ind].value;
1765 }
1766 EMIT_CMP(*range_jump_list, SLJIT_C_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1767 range_jump_list++;
1768 ind++;
1769 }
1770 ind++;
1771 }
1772
1773 *range_jump_list = NULL;
1774
1775 if (!invert) {
1776 no_states = compiler_common->no_states;
1777 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1778 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1779 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1780 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1781
1782 EMIT_LABEL(label);
1783 range_set_label(compiler_common->range_jump_list, label);
1784 /* Clears the jump list. */
1785 *compiler_common->range_jump_list = NULL;
1786 }
1787 return ind;
1788 }
1789
1790 #undef TERM_OFFSET_OF
1791 #undef EMIT_OP1
1792 #undef EMIT_OP2
1793 #undef EMIT_LABEL
1794 #undef EMIT_JUMP
1795 #undef EMIT_CMP
1796 #undef CHECK
1797
1798 /* --------------------------------------------------------------------- */
1799 /* Main compiler */
1800 /* --------------------------------------------------------------------- */
1801
1802 #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * sizeof(sljit_w))
1803
1804 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1805 CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4))
1806
1807 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1808 CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1809
1810 #define EMIT_LABEL(label) \
1811 label = sljit_emit_label(compiler_common.compiler); \
1812 CHECK(!label)
1813
1814 #define EMIT_JUMP(jump, type) \
1815 jump = sljit_emit_jump(compiler_common.compiler, type); \
1816 CHECK(!jump)
1817
1818 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1819 jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \
1820 CHECK(!jump)
1821
1822 /* A do {} while(0) expression helps to avoid goto statements. */
1823 #define BEGIN_GUARD \
1824 do {
1825
1826 #define END_GUARD \
1827 } while(0);
1828
1829 #define CHECK(exp) \
1830 if (SLJIT_UNLIKELY(exp)) \
1831 break;
1832
1833 struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error)
1834 {
1835 struct compiler_common compiler_common;
1836 sljit_w ind;
1837 int error_code, done, suggest_fast_forward;
1838 /* ID of an empty match (-1 if not reachable). */
1839 int empty_match_id;
1840
1841 struct sljit_jump *jump;
1842 struct sljit_jump *best_match_found_jump;
1843 struct sljit_jump *fast_forward_jump = NULL;
1844 struct sljit_jump *length_is_zero_jump;
1845 struct sljit_jump *end_check_jump = NULL;
1846 struct sljit_jump *best_match_check_jump = NULL;
1847 struct sljit_jump *non_greedy_end_jump = NULL;
1848 struct sljit_label *label;
1849 struct sljit_label *end_check_label = NULL;
1850 struct sljit_label *start_label;
1851 struct sljit_label *fast_forward_label;
1852 struct sljit_label *fast_forward_return_label;
1853
1854 if (error)
1855 *error = REGEX_NO_ERROR;
1856 #ifdef REGEX_MATCH_VERBOSE
1857 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE);
1858 #else
1859 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE);
1860 #endif
1861
1862 /* Step 1: parsing (Left->Right).
1863 Syntax check and AST generator. */
1864 error_code = parse(regex_string, length, &compiler_common);
1865 if (error_code) {
1866 stack_destroy(&compiler_common.stack);
1867 if (error)
1868 *error = error_code;
1869 return NULL;
1870 }
1871
1872 /* Step 2: generating branches (Right->Left). */
1873 error_code = generate_transitions(&compiler_common);
1874 stack_destroy(&compiler_common.stack);
1875 stack_destroy(&compiler_common.depth);
1876 if (error_code) {
1877 if (compiler_common.dfa_transitions)
1878 SLJIT_FREE(compiler_common.dfa_transitions);
1879 if (error)
1880 *error = error_code;
1881 return NULL;
1882 }
1883
1884 /* Step 3: Generate necessary data for depth-first search (Left->Right). */
1885 error_code = generate_search_states(&compiler_common);
1886 if (error_code) {
1887 SLJIT_FREE(compiler_common.dfa_transitions);
1888 if (error)
1889 *error = error_code;
1890 return NULL;
1891 }
1892
1893 #ifdef REGEX_MATCH_VERBOSE
1894 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1895 verbose_transitions(&compiler_common);
1896 #endif
1897
1898 /* Step 4: Left->Right generate code. */
1899 stack_init(&compiler_common.stack);
1900 stack_init(&compiler_common.depth);
1901 done = 0;
1902 compiler_common.machine = NULL;
1903 compiler_common.compiler = NULL;
1904 compiler_common.range_jump_list = NULL;
1905
1906 BEGIN_GUARD
1907
1908 compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (compiler_common.terms_size - 1) * sizeof(sljit_uw));
1909 CHECK(!compiler_common.machine);
1910
1911 compiler_common.compiler = sljit_create_compiler();
1912 CHECK(!compiler_common.compiler);
1913
1914 if (compiler_common.longest_range_size > 0) {
1915 compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * compiler_common.longest_range_size);
1916 CHECK(!compiler_common.range_jump_list);
1917 }
1918
1919 if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN))
1920 compiler_common.no_states = 4;
1921 else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN))
1922 compiler_common.no_states = 2;
1923 else
1924 compiler_common.no_states = 3;
1925
1926 compiler_common.machine->flags = compiler_common.flags;
1927 compiler_common.machine->no_states = compiler_common.no_states;
1928 compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size;
1929
1930 /* Study the regular expression. */
1931 empty_match_id = -1;
1932 suggest_fast_forward = 1;
1933 if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) {
1934 CHECK(trace_transitions(0, &compiler_common));
1935 while (compiler_common.stack.count > 0) {
1936 ind = stack_pop(&compiler_common.stack)->value;
1937 if (compiler_common.search_states[ind].type == 0) {
1938 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1939 suggest_fast_forward = 0;
1940 empty_match_id = compiler_common.search_states[ind].value;
1941 }
1942 else if (compiler_common.search_states[ind].type > 0) {
1943 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end);
1944 if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value)
1945 suggest_fast_forward = 0;
1946 }
1947 compiler_common.search_states[ind].value = -1;
1948 }
1949 }
1950 else {
1951 SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline);
1952 CHECK(trace_transitions(1, &compiler_common));
1953 while (compiler_common.stack.count > 0) {
1954 ind = stack_pop(&compiler_common.stack)->value;
1955 if (compiler_common.search_states[ind].type == 0) {
1956 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1957 suggest_fast_forward = 0;
1958 empty_match_id = compiler_common.search_states[ind].value;
1959 }
1960 compiler_common.search_states[ind].value = -1;
1961 }
1962 }
1963
1964 /* Step 4.1: Generate entry. */
1965 CHECK(sljit_emit_enter(compiler_common.compiler, 3, 5, 5, 0));
1966
1967 /* Copy arguments to their place. */
1968 EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_SAVED_REG1, 0);
1969 EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_SAVED_REG2, 0);
1970 EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_SAVED_REG3, 0, SLJIT_IMM, 1);
1971
1972 /* Init global registers. */
1973 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
1974 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
1975 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
1976 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin));
1977 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index));
1978
1979 /* Check whether the best match has already found in a previous frame. */
1980 EMIT_CMP(jump, SLJIT_C_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0);
1981 EMIT_JUMP(best_match_found_jump, SLJIT_JUMP);
1982
1983 #ifdef REGEX_MATCH_VERBOSE
1984 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1985 printf("\n-----------------\nTrace\n-----------------\n");
1986 #endif
1987
1988 /* Step 4.2: Generate code for state 0. */
1989 EMIT_LABEL(label);
1990 compiler_common.machine->entry_addrs[0] = (sljit_uw)label;
1991
1992 /* Swapping current and next. */
1993 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0);
1994 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0);
1995 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0);
1996
1997 /* Checking whether the best case needs to be updated. */
1998 if (!(compiler_common.flags & REGEX_MATCH_END)) {
1999 EMIT_CMP(end_check_jump, SLJIT_C_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2000 EMIT_LABEL(end_check_label);
2001 }
2002 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2003 EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1);
2004
2005 /* Checking whether best case has already found. */
2006 if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) {
2007 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2008 /* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */
2009 EMIT_CMP(best_match_check_jump, SLJIT_C_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2010 }
2011 else {
2012 /* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */
2013 EMIT_CMP(best_match_check_jump, SLJIT_C_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2014 }
2015 }
2016
2017 EMIT_LABEL(start_label);
2018 sljit_set_label(jump, start_label);
2019
2020 if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) {
2021 EMIT_CMP(fast_forward_jump, SLJIT_C_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2022 }
2023
2024 /* Loading the next character. */
2025 EMIT_OP2(SLJIT_SUB | SLJIT_SET_E, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1);
2026 EMIT_JUMP(length_is_zero_jump, SLJIT_C_EQUAL);
2027
2028 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0);
2029 #ifdef REGEX_USE_8BIT_CHARS
2030 EMIT_OP1(SLJIT_MOV_UB, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2031 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1);
2032 #else
2033 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2034 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2);
2035 #endif
2036 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0);
2037
2038 #ifdef REGEX_MATCH_VERBOSE
2039 if (compiler_common.flags & REGEX_MATCH_VERBOSE) {
2040 printf("(%3d): ", 0);
2041 CHECK(trace_transitions(0, &compiler_common));
2042 while (compiler_common.stack.count > 0) {
2043 ind = stack_pop(&compiler_common.stack)->value;
2044 if (compiler_common.search_states[ind].type >= 0)
2045 printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value);
2046 compiler_common.search_states[ind].value = -1;
2047 }
2048 printf("\n");
2049 }
2050 #endif
2051
2052 EMIT_LABEL(fast_forward_return_label);
2053 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2054 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1);
2055 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2056 EMIT_CMP(jump, SLJIT_C_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2057 }
2058
2059 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0);
2060 CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE));
2061 /* And branching to the first state. */
2062 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2063
2064 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2065 EMIT_LABEL(label);
2066 sljit_set_label(jump, label);
2067 }
2068 }
2069 /* This is the case where we only have to reset the R_NEXT_HEAD. */
2070 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
2071 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2072 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2073
2074 /* Fast-forward loop. */
2075 if (fast_forward_jump) {
2076 /* Quit from fast-forward loop. */
2077 EMIT_LABEL(fast_forward_label);
2078 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2079 EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0);
2080 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0);
2081 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0);
2082 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
2083 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
2084 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
2085
2086 /* Update the start field of the locations. */
2087 CHECK(trace_transitions(0, &compiler_common));
2088 while (compiler_common.stack.count > 0) {
2089 ind = stack_pop(&compiler_common.stack)->value;
2090 if (compiler_common.search_states[ind].type >= 0) {
2091 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0);
2092 }
2093 compiler_common.search_states[ind].value = -1;
2094 }
2095 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2096 EMIT_JUMP(jump, SLJIT_JUMP);
2097 sljit_set_label(jump, fast_forward_return_label);
2098
2099 /* Start fast-forward. */
2100 EMIT_LABEL(label);
2101 sljit_set_label(fast_forward_jump, label);
2102
2103 /* Moving everything to registers. */
2104 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2105 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2106 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2107 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0);
2108 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0);
2109 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0);
2110
2111 /* Fast forward mainloop. */
2112 EMIT_LABEL(label);
2113 EMIT_OP2(SLJIT_SUB | SLJIT_SET_E, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1);
2114 EMIT_JUMP(fast_forward_jump, SLJIT_C_EQUAL);
2115
2116 #ifdef REGEX_USE_8BIT_CHARS
2117 EMIT_OP1(SLJIT_MOV_UB, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2118 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1);
2119 #else
2120 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2121 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2);
2122 #endif
2123
2124 CHECK(trace_transitions(0, &compiler_common));
2125 CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label));
2126
2127 EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2128 EMIT_JUMP(jump, SLJIT_JUMP);
2129 sljit_set_label(jump, label);
2130
2131 /* String is finished. */
2132 EMIT_LABEL(label);
2133 sljit_set_label(fast_forward_jump, label);
2134 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0);
2135 EMIT_JUMP(fast_forward_jump, SLJIT_JUMP);
2136 }
2137
2138 /* End check. */
2139 if (end_check_jump) {
2140 EMIT_LABEL(label);
2141 sljit_set_label(end_check_jump, label);
2142
2143 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2144 CHECK(compile_end_check(&compiler_common, end_check_label));
2145 }
2146 else {
2147 /* Since we leave, we do not need to update the R_BEST_BEGIN. */
2148 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2149 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
2150 if (compiler_common.flags & REGEX_ID_CHECK) {
2151 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
2152 }
2153 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2154 EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP);
2155 }
2156 }
2157
2158 /* Finish check. */
2159 if (best_match_check_jump) {
2160 EMIT_LABEL(label);
2161 sljit_set_label(best_match_check_jump, label);
2162
2163 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2164 EMIT_CMP(jump, SLJIT_C_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2165 sljit_set_label(jump, start_label);
2166 }
2167 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2168 }
2169
2170 /* Leaving matching and storing the necessary values. */
2171 EMIT_LABEL(label);
2172 sljit_set_label(length_is_zero_jump, label);
2173 if (non_greedy_end_jump)
2174 sljit_set_label(non_greedy_end_jump, label);
2175
2176 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0);
2177 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2178 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2179 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2180
2181 /* Exit from JIT. */
2182 EMIT_LABEL(label);
2183 sljit_set_label(best_match_found_jump, label);
2184 if (fast_forward_jump)
2185 sljit_set_label(fast_forward_jump, label);
2186 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_UNUSED, 0, 0));
2187
2188 ind = 1;
2189 while (ind < compiler_common.dfa_size - 1) {
2190 if (compiler_common.search_states[ind].type >= 0) {
2191 SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size);
2192 EMIT_LABEL(label);
2193 compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label;
2194
2195 if (compiler_common.dfa_transitions[ind].type == type_char) {
2196 EMIT_CMP(jump, SLJIT_C_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value);
2197 }
2198 else if (compiler_common.dfa_transitions[ind].type == type_rng_start) {
2199 ind = compile_range_check(&compiler_common, ind);
2200 CHECK(!ind);
2201 }
2202 else {
2203 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2204 CHECK(compile_newline_check(&compiler_common, ind));
2205 }
2206
2207 CHECK(trace_transitions(ind, &compiler_common));
2208 #ifdef REGEX_MATCH_VERBOSE
2209 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2210 printf("(%3d): ", compiler_common.search_states[ind].type);
2211 #endif
2212 CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type));
2213
2214 if (compiler_common.dfa_transitions[ind].type == type_char) {
2215 EMIT_LABEL(label);
2216 sljit_set_label(jump, label);
2217 }
2218 else if (compiler_common.dfa_transitions[ind].type == type_rng_end) {
2219 EMIT_LABEL(label);
2220 range_set_label(compiler_common.range_jump_list, label);
2221 }
2222 else {
2223 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2224 }
2225
2226 /* Branch to the next item in the list. */
2227 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1));
2228 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1);
2229 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2230 }
2231 ind++;
2232 }
2233
2234 if (ind == compiler_common.dfa_size - 1) {
2235 /* Generate an init stub function. */
2236 EMIT_LABEL(label);
2237 CHECK(sljit_emit_enter(compiler_common.compiler, 2, 3, 3, 0));
2238
2239 if (empty_match_id == -1) {
2240 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SAVED_REG2), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1);
2241 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SAVED_REG2), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0);
2242 }
2243 else {
2244 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SAVED_REG2), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2245 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SAVED_REG2), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id);
2246 }
2247
2248 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SAVED_REG2), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2);
2249
2250 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) {
2251 /* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */
2252 SLJIT_ASSERT(R_CURR_STATE == SLJIT_SAVED_REG1);
2253 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2254 /* R_CURR_INDEX (put to R_TEMP) is zero. */
2255 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0);
2256 }
2257 CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE));
2258 }
2259 else {
2260 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2261 }
2262 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0));
2263
2264 compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler);
2265 #ifndef SLJIT_INDIRECT_CALL
2266 compiler_common.machine->u.init_match = (void*)(sljit_w)sljit_get_label_addr(label);
2267 #else
2268 sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile);
2269 #endif
2270 #ifdef REGEX_MATCH_VERBOSE
2271 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2272 printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match);
2273 #endif
2274 if (compiler_common.machine->continue_match) {
2275 for (ind = 0; ind < compiler_common.terms_size; ++ind)
2276 compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]);
2277 done = 1;
2278 }
2279 }
2280 END_GUARD
2281
2282 stack_destroy(&compiler_common.stack);
2283 stack_destroy(&compiler_common.depth);
2284 SLJIT_FREE(compiler_common.dfa_transitions);
2285 SLJIT_FREE(compiler_common.search_states);
2286 if (compiler_common.range_jump_list)
2287 SLJIT_FREE(compiler_common.range_jump_list);
2288 if (compiler_common.compiler)
2289 sljit_free_compiler(compiler_common.compiler);
2290 if (done)
2291 return compiler_common.machine;
2292
2293 if (compiler_common.machine) {
2294 SLJIT_FREE(compiler_common.machine);
2295 }
2296 if (error)
2297 *error = REGEX_MEMORY_ERROR;
2298 return NULL;
2299 }
2300
2301 #undef TERM_OFFSET_OF
2302 #undef EMIT_OP1
2303 #undef EMIT_OP2
2304 #undef EMIT_LABEL
2305 #undef EMIT_JUMP
2306 #undef EMIT_CMP
2307 #undef BEGIN_GUARD
2308 #undef END_GUARD
2309 #undef CHECK
2310
2311 void regex_free_machine(struct regex_machine *machine)
2312 {
2313 sljit_free_code(machine->continue_match);
2314 SLJIT_FREE(machine);
2315 }
2316
2317 const char* regex_get_platform_name(void)
2318 {
2319 return sljit_get_platform_name();
2320 }
2321
2322 /* --------------------------------------------------------------------- */
2323 /* Mathching utilities */
2324 /* --------------------------------------------------------------------- */
2325
2326 struct regex_match* regex_begin_match(struct regex_machine *machine)
2327 {
2328 sljit_w *ptr1;
2329 sljit_w *ptr2;
2330 sljit_w *end;
2331 sljit_w *entry_addrs;
2332
2333 struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (machine->size * 2 - 1) * sizeof(sljit_w));
2334 if (!match)
2335 return NULL;
2336
2337 ptr1 = match->states;
2338 ptr2 = match->states + machine->size;
2339 end = ptr2;
2340 entry_addrs = (sljit_w*)machine->entry_addrs;
2341
2342 match->current = ptr1;
2343 match->next = ptr2;
2344 match->head = 0;
2345 match->machine = machine;
2346
2347 /* Init machine states. */
2348 switch (machine->no_states) {
2349 case 2:
2350 while (ptr1 < end) {
2351 *ptr1++ = *entry_addrs;
2352 *ptr2++ = *entry_addrs++;
2353 *ptr1++ = -1;
2354 *ptr2++ = -1;
2355 }
2356 break;
2357
2358 case 3:
2359 while (ptr1 < end) {
2360 *ptr1++ = *entry_addrs;
2361 *ptr2++ = *entry_addrs++;
2362 *ptr1++ = -1;
2363 *ptr2++ = -1;
2364 *ptr1++ = 0;
2365 *ptr2++ = 0;
2366 }
2367 break;
2368
2369 case 4:
2370 while (ptr1 < end) {
2371 *ptr1++ = *entry_addrs;
2372 *ptr2++ = *entry_addrs++;
2373 *ptr1++ = -1;
2374 *ptr2++ = -1;
2375 *ptr1++ = 0;
2376 *ptr2++ = 0;
2377 *ptr1++ = 0;
2378 *ptr2++ = 0;
2379 }
2380 break;
2381
2382 default:
2383 SLJIT_ASSERT_STOP();
2384 break;
2385 }
2386
2387 SLJIT_ASSERT(ptr1 == end);
2388
2389 match->u.continue_match = machine->continue_match;
2390
2391 regex_reset_match(match);
2392 return match;
2393 }
2394
2395 void regex_reset_match(struct regex_match *match)
2396 {
2397 struct regex_machine *machine = match->machine;
2398 sljit_w current, ind;
2399 sljit_w *current_ptr;
2400
2401 match->best_end = 0;
2402 match->fast_quit = 0;
2403 match->fast_forward = 0;
2404
2405 if (match->head != 0) {
2406 /* Clear the current state. */
2407 current = match->head;
2408 current_ptr = match->current;
2409 do {
2410 ind = (current / sizeof(sljit_w)) + 1;
2411 current = current_ptr[ind];
2412 current_ptr[ind] = -1;
2413 } while (current != 0);
2414 }
2415 match->head = machine->u.call_init(match->current, match);
2416 }
2417
2418 void regex_free_match(struct regex_match *match)
2419 {
2420 SLJIT_FREE(match);
2421 }
2422
2423 void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length)
2424 {
2425 match->u.call_continue(match, input_string, length);
2426 }
2427
2428 int regex_get_result(struct regex_match *match, int *end, int *id)
2429 {
2430 int flags = match->machine->flags;
2431 sljit_w no_states;
2432
2433 *end = match->best_end;
2434 *id = match->best_id;
2435 if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END)))
2436 return match->best_begin;
2437
2438 if (flags & REGEX_FAKE_MATCH_END) {
2439 SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END)));
2440 if (match->best_begin != -1)
2441 return match->best_begin;
2442
2443 no_states = match->machine->no_states;
2444 if (match->current[no_states + 1] == -1)
2445 return -1;
2446 if (flags & REGEX_ID_CHECK)
2447 *id = match->current[no_states + 3];
2448 if (!(flags & REGEX_FAKE_MATCH_BEGIN))
2449 *end = match->index - 1;
2450 else
2451 *end = match->index - 2;
2452 return match->current[no_states + 2];
2453 }
2454 else {
2455 /* Check the status of the last code. */
2456 if (!(flags & REGEX_MATCH_BEGIN)) {
2457 /* No shortcut in this case. */
2458 if (!(flags & REGEX_ID_CHECK)) {
2459 if (match->current[1] == -1)
2460 return -1;
2461 *end = match->index - 1;
2462 return match->current[2];
2463 }
2464
2465 if (match->current[1] == -1)
2466 return -1;
2467 *end = match->index - 1;
2468 *id = match->current[3];
2469 return match->current[2];
2470 }
2471
2472 /* Shortcut is possible in this case. */
2473 if (!(flags & REGEX_ID_CHECK)) {
2474 if (match->current[1] == -1 || match->head == -1)
2475 return -1;
2476 *end = match->index - 1;
2477 return 0;
2478 }
2479
2480 if (match->current[1] == -1 || match->head == -1)
2481 return -1;
2482 *end = match->index - 1;
2483 *id = match->current[2];
2484 return 0;
2485 }
2486 }
2487
2488 int regex_is_match_finished(struct regex_match *match)
2489 {
2490 return match->fast_quit;
2491 }
2492
2493 #ifdef REGEX_MATCH_VERBOSE
2494 void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length)
2495 {
2496 sljit_w *ptr;
2497 sljit_w *end;
2498 sljit_w count;
2499 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2500 sljit_w current;
2501 #endif
2502 sljit_w no_states = match->machine->no_states;
2503 sljit_w len = match->machine->size;
2504
2505 while (length > 0) {
2506 match->u.call_continue(match, input_string, 1);
2507
2508 if (match->fast_forward) {
2509 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2510 printf("fast forward\n");
2511 }
2512
2513 /* Verbose (first). */
2514 if (match->machine->flags & REGEX_MATCH_VERBOSE) {
2515 ptr = match->current;
2516 end = ptr + len;
2517 count = 0;
2518 printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id);
2519 while (ptr < end) {
2520 printf("[%3ld:", (long)count++);
2521 switch (no_states) {
2522 case 2:
2523 if (ptr[1] != -1)
2524 printf("+] ");
2525 else
2526 printf(" ] ");
2527 break;
2528
2529 case 3:
2530 if (ptr[1] != -1)
2531 printf("+,%3ld] ", (long)ptr[2]);
2532 else
2533 printf(" ,XXX] ");
2534 break;
2535
2536 case 4:
2537 if (ptr[1] != -1)
2538 printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]);
2539 else
2540 printf(" ,XXX,XXX] ");
2541 break;
2542 }
2543 ptr += no_states;
2544 }
2545 printf("\n");
2546 }
2547
2548 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2549 /* Sanity check (later). */
2550 ptr = match->next;
2551 end = ptr + len;
2552 while (ptr < end) {
2553 SLJIT_ASSERT(ptr[1] == -1);
2554 ptr += no_states;
2555 }
2556
2557 /* Check number of active elements. */
2558 ptr = match->current + no_states;
2559 end = ptr + len - no_states;
2560 count = 0;
2561 while (ptr < end) {
2562 if (ptr[1] != -1)
2563 count++;
2564 ptr += no_states;
2565 }
2566
2567 /* Check chain list. */
2568 current = match->head;
2569 ptr = match->current;
2570 while (current != 0) {
2571 SLJIT_ASSERT(current >= 0 && current < len * sizeof(sljit_w));
2572 SLJIT_ASSERT((current % (no_states * sizeof(sljit_w))) == 0);
2573 SLJIT_ASSERT(count > 0);
2574 current = ptr[(current / sizeof(sljit_w)) + 1];
2575 count--;
2576 }
2577 SLJIT_ASSERT(count == 0);
2578 #endif
2579
2580 if (match->fast_quit) {
2581 /* the machine has stopped working. */
2582 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2583 printf("Best match has found\n");
2584 break;
2585 }
2586
2587 input_string++;
2588 length--;
2589 }
2590 }
2591 #endif
2592