syntax.lsp revision 5dfecf96
1;;
2;; Copyright (c) 2002 by The XFree86 Project, Inc.
3;;
4;; Permission is hereby granted, free of charge, to any person obtaining a
5;; copy of this software and associated documentation files (the "Software"),
6;; to deal in the Software without restriction, including without limitation
7;; the rights to use, copy, modify, merge, publish, distribute, sublicense,
8;; and/or sell copies of the Software, and to permit persons to whom the
9;; Software is furnished to do so, subject to the following conditions:
10;;
11;; The above copyright notice and this permission notice shall be included in
12;; all copies or substantial portions of the Software.
13;;
14;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17;; THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19;; OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20;; SOFTWARE.
21;;
22;; Except as contained in this notice, the name of the XFree86 Project shall
23;; not be used in advertising or otherwise to promote the sale, use or other
24;; dealings in this Software without prior written authorization from the
25;; XFree86 Project.
26;;
27;; Author: Paulo César Pereira de Andrade
28;;
29;;
30;; $XFree86: xc/programs/xedit/lisp/modules/syntax.lsp,v 1.11 2003/01/16 03:50:46 paulo Exp $
31;;
32
33(provide "syntax")
34(require "xedit")
35(in-package "XEDIT")
36
37(defvar *syntax-symbols* '(
38    syntax-highlight defsyntax defsynprop synprop-p syntax-p
39    syntable syntoken synaugment
40    *prop-default* *prop-keyword* *prop-number* *prop-string*
41    *prop-constant* *prop-comment* *prop-preprocessor*
42    *prop-punctuation* *prop-error* *prop-annotation*
43))
44(export *syntax-symbols*)
45(in-package "USER")
46(dolist (symbol xedit::*syntax-symbols*)
47    (import symbol)
48)
49(in-package "XEDIT")
50(makunbound '*syntax-symbols*)
51
52#|
53TODO:
54o Add a command to match without increment the offset in the input, this
55  may be useful for example in a case like:
56	some-table
57	    match "<"
58		switch -1
59	match "<"	<- the table already eated this, so it won't be matched.
60  This must be carefully checked at compile time, such instruction should
61  be in a token that returns or starts a new one, and even then, may need
62  runtime check to make sure it won't enter an infinite loop.
63o Allow combining properties, this is supported in Xaw, and could allow some
64  very interesting effects for complex documents.
65o Maybe have an separated function/loop for tables that don't have tokens
66  that start/switch to another table, and/or have the contained attribute set.
67  This could allow running considerably faster.
68o Do a better handling of interactive edition for tokens that start and end
69  with the same pattern, as an example strings, if the user types '"', it
70  will parse up to the end of the file, "inverting" all strings.
71o Allow generic code to be run once a match is found, such code could handle
72  some defined variables and take decisions based on the parser state. This
73  should be detected at compile time, to maybe run a different parser for
74  such syntax tables, due to the extra time building the environment to
75  call the code. This would be useful to "really" parse documents with
76  complex syntax, for example, a man page source file.
77o Add command to change current default property without initializing a new
78  state.
79o Fix problems matching EOL. Since EOL is an empty string match, if there
80  is a rule to match only EOL, but some other rule matches up to the end
81  of the input, the match to EOL will not be recognized. Currently the only
82  way to handle this is to have a nested table that always returns once a
83  match is found, so that it will restart the match loop code even if the
84  input is at EOL.
85  One possible solution would be to add the ending newline to the input,
86  and then instead of matching "$", should match "\\n".
87o XXX Usage of the variable newline-property must be reviewed in function
88  syntax-highlight, if the text property has a background attribute,
89  visual effect will look "strange", will paint a square with the
90  background attribute at the end of every line in the matched text.
91|#
92
93;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
94;; Some annotations to later write documentation for the module...
95;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
96#|
97    The current interface logic should be easy to understand for people
98that have written lex scanners before. It has some extended semantics,
99that could be translated to stacked BEGIN() statements in lex, but
100currently does not have rules for matches in the format RE/TRAILING, as
101well as code attached to rules (the biggest difference) and/or things
102like REJECT and unput(). Also, at least currently, it is *really* quite
103slower than lex.
104
105	MATCHING RULES
106	--------------
107    When two tokens are matched at the same input offset, the longest
108token is used, if the length is the same, the first definition is
109used. For example:
110	token1	=>	int
111	token2	=>	[A-Za-z]+
112	input	=>	integer
113    Token1 matches "int" and token2 matches "integer", but since token2 is
114longer, it is used. But in the case:
115	token1	=>	int
116	token2	=>	[A-Za-z]+
117	input	=>	int
118    Both, token1 and token2 match "int", since token1 is defined first, it
119is used.
120|#
121
122
123;;  Initialize some default properties that may be shared in syntax
124;; highlight definitions. Use of these default properties is encouraged,
125;; so that "tokens" will be shown identically when editing program
126;; sources in different programming languages.
127(defsynprop *prop-default*
128    "default"
129    :font	"*courier-medium-r*-12-*"
130    :foreground	"black")
131
132(defsynprop *prop-keyword*
133    "keyword"
134    :font	"*courier-bold-r*-12-*"
135    :foreground	"gray12")
136
137(defsynprop *prop-number*
138    "number"
139    :font	"*courier-bold-r*-12-*"
140    :foreground	"OrangeRed3")
141
142(defsynprop *prop-string*
143    "string"
144    :font	"*lucidatypewriter-medium-r*-12-*"
145    :foreground	"RoyalBlue2")
146
147(defsynprop *prop-constant*
148    "constant"
149    :font	"*lucidatypewriter-medium-r*-12-*"
150    :foreground	"VioletRed3")
151
152(defsynprop *prop-comment*
153    "comment"
154    :font	"*courier-medium-o*-12-*"
155    :foreground	"SlateBlue3")
156
157(defsynprop *prop-preprocessor*
158    "preprocessor"
159    :font	"*courier-medium-r*-12-*"
160    :foreground	"green4")
161
162(defsynprop *prop-punctuation*
163    "punctuation"
164    :font	"*courier-bold-r*-12-*"
165    :foreground	"gray12")
166
167;; Control characters, not always errors...
168(defsynprop *prop-control*
169    "control"
170    :font	"*courier-bold-r*-12-*"
171    :foreground	"yellow2"
172    :background	"red3")
173
174(defsynprop *prop-error*
175    "error"
176    :font	"*new century schoolbook-bold*-24-*"
177    :foreground	"yellow"
178    :background	"red")
179
180(defsynprop *prop-annotation*
181    "annotation"
182    :font	"*courier-medium-r*-12-*"
183    :foreground	"black"
184    :background	"PaleGreen")
185
186
187;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
188;;  The "main" definition of the syntax highlight coding interface.
189;;  Creates a "special" variable with the given name, associating to
190;; it an already compiled syntax table.
191;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
192(defmacro defsyntax (variable label property indent options &rest lists)
193    `(if (boundp ',variable)
194	,variable
195	(progn
196	    (proclaim '(special ,variable))
197	    (setq ,variable
198		(compile-syntax-table
199		    (string ',variable) ,options
200		    (syntable ,label ,property ,indent ,@lists)
201		)
202	    )
203	)
204    )
205)
206
207;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
208;;  Just a wrapper to create a hash-table and bound it to a symbol.
209;;  Example of call:
210;;	(defsynoptions *my-syntax-options*
211;;	    (:indent		.	8)
212;;	    (:indent-option-1	.	1)
213;;	    (:indent-option-2	.	2)
214;;	)
215;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
216(defmacro defsynoptions (variable &rest options)
217    `(if (boundp ',variable)
218	,variable
219	(progn
220	    (proclaim '(special ,variable))
221	    (setq ,variable (make-hash-table :initial-contents ',options))
222	)
223    )
224)
225
226;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
227;; These definitions should be "private".
228;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
229(defstruct syntoken
230    regex		;; A compiled regexp.
231    property		;; NIL for default, or a synprop structure.
232    contained		;; Only used when switch/begin is not NIL. Values:
233			;;	NIL	  -> just switch to or begin new
234			;;		     syntax table.
235			;;	(not NIL) -> apply syntoken property
236			;;		     (or default one) to matched
237			;;		     text *after* switching to or
238			;;		     beginning a new syntax table.
239    switch		;; Values for switch are:
240			;;	NIL	  -> do nothing
241			;;	A keyword -> switch to the syntax table
242			;;		     identified by the keyword.
243			;;	A negative integer -> Pop the stack
244			;;			      -<swich-value> times.
245			;;			      A common value is -1,
246			;;			     to switch to the previous
247			;;			     state, but some times
248			;;			     it is desired to return
249			;;			     two or more times in
250			;;			     in the stack.
251			;;  NOTE: This is actually a jump, the stack is
252			;; popped until the named syntax table is found,
253			;; if the stack becomes empty, a new state is
254			;; implicitly created.
255    begin		;;  NIL or a keyword (like switch), but instead of
256			;; popping the stack, it pushes the current syntax
257			;; table to the stack and sets a new current one.
258)
259
260
261;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
262;; Just a wrapper to make-syntoken.
263;;	TODO: Add support for structure constructors.
264;;	XXX: Note that the NOSUB only works with the xedit regex, it
265;; will still return the match offsets, but will ignore subexpressions,
266;; that is, parenthesis are used only for grouping.
267;;	TODO: Create a new version of the re-exec call that returns
268;; offsets in the format (<from> . <to>) and not
269;; ((<from0> . <to0>) ... (<fromN> . <toN>)). Only the global result
270;; is expected/used, so there is no reason to allocate more than one
271;; cons cell per call.
272;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
273(defun syntoken (pattern
274		 &key icase nospec property contained switch begin (nosub t)
275		 &aux
276		 (regex
277		    (re-comp pattern :icase icase :nospec nospec :nosub nosub)
278		 )
279		 check)
280
281    ;;  Don't allow a regex that matches the null string enter the
282    ;; syntax table list.
283    (if (consp (setq check (re-exec regex "" :noteol t :notbol t)))
284#+xedit	(error "SYNTOKEN: regex matches empty string ~S" regex)
285#-xedit	()
286    )
287
288    (make-syntoken
289	:regex		regex
290	:property	property
291	:contained	contained
292	:switch		switch
293	:begin		begin
294    )
295)
296
297
298;;  This structure is defined only to do some type checking, it just
299;; holds a list of keywords.
300(defstruct synaugment
301    labels		;; List of keywords labeling syntax tables.
302)
303
304(defstruct syntable
305    label		;; A keyword naming this syntax table.
306    property		;; NIL or a default synprop structure.
307    indent		;; Indentation function for the syntax table.
308    tokens		;; A list of syntoken structures.
309    tables		;; A list of syntable structures.
310    augments		;;  A list of synaugment structures, used only
311			;; at "compile time", so that a table can be
312			;; used before it's definition.
313    bol			;;  One of the tokens match the empty string at
314			;; the start of a line (loop optimization hint).
315			;; Field filled at "link" time.
316    eol			;;  Same comments as bol, but in this case, for
317			;; the empty string at the end of a line.
318)
319
320
321;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
322;;  Just call make-syntable, but sorts the elements by type, allowing
323;; a cleaner code when defining the syntax highlight rules.
324;; XXX Same comments as for syntoken about the use of a constructor for
325;; structures. TODO: when/if clos is implemented in the interpreter.
326;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
327(defun syntable (label default-property indent &rest definitions)
328
329    ;; Check for possible errors in the arguments.
330    (unless (keywordp label)
331	(error "SYNTABLE: ~A is not a keyword" label)
332    )
333    (unless
334	(or
335	    (null default-property)
336	    (synprop-p default-property)
337	)
338	(error "SYNTABLE: ~A is an invalid text property"
339	    default-property
340	)
341    )
342
343    ;; Don't allow unknown data in the definition list.
344    ;; XXX typecase should be added to the interpreter, and since
345    ;;     the code is traversing the entire list, it could build
346    ;;     now the arguments to make-syntable.
347    (dolist (item definitions)
348	(unless
349	    (or
350
351		;;  Allow NIL in the definition list, so that one
352		;; can put conditionals in the syntax definition,
353		;; and if the conditional is false, fill the slot
354		;; with a NIL value.
355		(atom item)
356		(syntoken-p item)
357		(syntable-p item)
358		(synaugment-p item)
359	    )
360	    (error "SYNTABLE: invalid syntax table argument ~A" item)
361	)
362    )
363
364    ;; Build the syntax table.
365    (make-syntable
366	:label		label
367	:property	default-property
368	:indent		indent
369	:tokens		(remove-if-not #'syntoken-p definitions)
370	:tables		(remove-if-not #'syntable-p definitions)
371	:augments	(remove-if-not #'synaugment-p definitions)
372    )
373)
374
375
376;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
377;;  Just to do a "preliminary" error checking, every element must be a
378;; a keyword, and also check for reserved names.
379;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
380(defun synaugment (&rest keywords)
381    (dolist (keyword keywords)
382	(unless (keywordp keyword)
383	    (error "SYNAUGMENT: bad syntax table label ~A" keyword)
384	)
385    )
386    (make-synaugment :labels keywords)
387)
388
389
390;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
391;; Recursive compile utility function.
392;; Returns a cons in the format:
393;;	car	=>	List of all syntoken structures
394;;			(including child tables).
395;;	cdr	=>	List of all child syntable structures.
396;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
397(defun list-syntable-elements (table &aux result sub-result)
398    (setq
399	result
400	(cons
401	    (syntable-tokens table)
402	    (syntable-tables table))
403    )
404
405    ;; For every child syntax table.
406    (dolist (child (syntable-tables table))
407
408	;; Recursively call list-syntable-elements.
409	(setq sub-result (list-syntable-elements child))
410
411	(rplaca result (append (car result) (car sub-result)))
412	(rplacd result (append (cdr result) (cdr sub-result)))
413    )
414
415    ;; Return the pair of nested tokens and tables.
416    result
417)
418
419
420;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
421;;  Append tokens of the augment list to the tokens of the specified
422;; syntax table.
423;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
424(defun compile-syntax-augment-list (table table-list
425				    &aux labels augment tokens)
426
427    ;; Create a list of all augment tables.
428    (dolist (augment (syntable-augments table))
429	(setq labels (append labels (synaugment-labels augment)))
430    )
431
432    ;;  Remove duplicates and references to "itself",
433    ;; without warnings?
434    (setq
435	labels
436	(remove
437	    (syntable-label table)
438	    (remove-duplicates labels :from-end t)
439	)
440    )
441
442    ;; Check if the specified syntax tables exists!
443    (dolist (label labels)
444	(unless
445	    (setq
446		augment
447		(car (member label table-list :key #'syntable-label))
448	    )
449	    (error "COMPILE-SYNTAX-AUGMENT-LIST: Cannot augment ~A in ~A"
450		label
451		(syntable-label table)
452	    )
453	)
454
455	;; Increase list of tokens.
456	(setq tokens (append tokens (syntable-tokens augment)))
457    )
458
459    ;;  Store the tokens in the augment list. They will be added
460    ;; to the syntax table in the second pass.
461    (setf (syntable-augments table) tokens)
462
463    ;;  Recurse on every child table.
464    (dolist (child (syntable-tables table))
465	(compile-syntax-augment-list child table-list)
466    )
467)
468
469
470;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
471;;  Just add the augmented tokens to the token list, recursing on
472;; every child syntax table.
473;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
474(defun link-syntax-augment-table (table)
475    (setf
476	(syntable-tokens table)
477	;;  When augmenting a table, duplicated tokens or different tokens
478	;; that use the same regex pattern should be common.
479	(remove-duplicates
480	    (nconc (syntable-tokens table) (syntable-augments table))
481	    :key	#'syntoken-regex
482	    :test	#'equal
483	    :from-end	t
484	)
485
486	;;  Don't need to keep this list anymore.
487	(syntable-augments table)
488	()
489    )
490
491    ;;  Check if one of the tokens match the empty string at the
492    ;; start or end of a text line. XXX The fields bol and eol
493    ;; are expected to be initialized to NIL.
494    (dolist (token (syntable-tokens table))
495	(when (consp (re-exec (syntoken-regex token) "" :noteol t))
496	    (setf (syntable-bol table) t)
497	    (return)
498	)
499    )
500    (dolist (token (syntable-tokens table))
501	(when (consp (re-exec (syntoken-regex token) "" :notbol t))
502	    (setf (syntable-eol table) t)
503	    (return)
504	)
505    )
506
507    (dolist (child (syntable-tables table))
508	(link-syntax-augment-table child)
509    )
510)
511
512
513;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
514;; "Compile" the main structure of the syntax highlight code.
515;; Variables "switches" and "begins" are used only for error checking.
516;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
517(defun compile-syntax-table (name options main-table &aux syntax elements
518			     switches begins tables properties)
519    (unless (stringp name)
520	(error "COMPILE-SYNTAX-TABLE: ~A is not a string" name)
521    )
522
523    (setq
524	elements
525	(list-syntable-elements main-table)
526
527	switches
528	(remove-if
529	    #'null
530	    (car elements)
531	    :key #'syntoken-switch
532	)
533
534	begins
535	(remove-if-not
536	    #'keywordp
537	    (car elements)
538	    :key #'syntoken-begin
539	)
540
541	;;  The "main-table" isn't in the list, because
542	;; list-syntable-elements includes only the child tables;
543	;; this is done to avoid the need of removing duplicates here.
544	tables
545	(cons main-table (cdr elements))
546    )
547
548    ;; Check for typos in the keywords, or for not defined syntax tables.
549    (dolist (item (mapcar #'syntoken-switch switches))
550	(unless
551	    (or
552		(and
553		    (integerp item)
554		    (minusp item)
555		)
556		(member item tables :key #'syntable-label)
557	    )
558	    (error "COMPILE-SYNTAX-TABLE: SWITCH ~A cannot be matched"
559		item
560	    )
561	)
562    )
563    (dolist (item (mapcar #'syntoken-begin begins))
564	(unless (member item tables :key #'syntable-label)
565	    (error "COMPILE-SYNTAX-TABLE: BEGIN ~A cannot be matched"
566		item
567	    )
568	)
569    )
570
571    ;; Create a list of all properties used by the syntax.
572    (setq
573	properties
574	(delete-duplicates
575
576	    ;; Remove explicitly set to "default" properties.
577	    (remove nil
578
579		(append
580
581		    ;; List all properties in the syntoken list.
582		    (mapcar
583			#'syntoken-property
584			(car elements)
585		    )
586
587		    ;; List all properties in the syntable list.
588		    (mapcar
589			#'syntable-property
590			tables
591		    )
592		)
593	    )
594	    :test #'string=
595	    :key  #'synprop-name
596	)
597    )
598
599    ;;  Provide a default property if none specified.
600    (unless
601	(member
602	    "default"
603	    properties
604	    :test #'string=
605	    :key #'synprop-name
606	)
607	(setq properties (append (list *prop-default*) properties))
608    )
609
610
611    ;;  Now that a list of all nested syntax tables is known, compile the
612    ;; augment list. Note that even the main-table can be augmented to
613    ;; include tokens of one of it's children.
614
615    ;;  Adding the tokens of the augment tables must be done in
616    ;; two passes, or it may cause surprises due to "inherited"
617    ;; tokens, as the augment table was processed first, and
618    ;; increased it's token list.
619    (compile-syntax-augment-list main-table tables)
620
621    ;;  Now just append the augmented tokens to the table's token list.
622    (link-syntax-augment-table main-table)
623
624    ;;  Change all syntoken switch and begin fields to point to the
625    ;; syntable.
626    (dolist (item switches)
627	(if (keywordp (syntoken-switch item))
628	    ;;  A switch may be relative, check if a keyword
629	    ;; was specified.
630	    (setf
631		(syntoken-switch item)
632		(car
633		    (member
634			(syntoken-switch item)
635			tables
636			:key #'syntable-label
637		    )
638		)
639	    )
640	)
641    )
642    (dolist (item begins)
643	(setf
644	    (syntoken-begin item)
645	    (car
646		(member
647		    (syntoken-begin item)
648		    tables
649		    :key #'syntable-label
650		)
651	    )
652	)
653    )
654
655    ;;  Don't need to add a entity for default properties
656    (dolist (item (car elements))
657	(and
658	    (syntoken-property item)
659	    (string= (synprop-name (syntoken-property item)) "default")
660	    (setf (syntoken-property item) ())
661	)
662    )
663    (dolist (item tables)
664	(and
665	    (syntable-property item)
666	    (string= (synprop-name (syntable-property item)) "default")
667	    (setf (syntable-property item) ())
668	)
669    )
670
671    (setq syntax
672	(make-syntax
673	    :name	name
674	    :options	options
675	    :labels	tables
676	    :quark
677		(compile-syntax-property-list
678		    name
679		    properties
680		)
681	    :token-count
682		(length (car elements))
683	)
684    )
685
686    ;; Ready to run!
687)
688
689
690;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
691;;  Loop applying the specifed syntax table to the text.
692;;  XXX This function needs a review. Should compile the regex patterns
693;; with newline sensitive match (and scan the entire file), and keep a
694;; cache of matched tokens (that may be at a very longer offset), and,
695;; when the match is removed from the cache, readd the token to the
696;; token-list; if the token does not match, it will not be in the cache,
697;; but should be removed from the token-list. If properly implemented, it
698;; should be somewhat like 4 times faster, but I would not be surprised
699;; if it becames even faster.
700;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
701(defun syntax-highlight (*syntax*
702			 &optional
703			 (*from* (point-min))
704			 (*to* (point-max))
705			 interactive
706			 &aux
707#+debug			 (*line-number* 0)
708			 stream
709			 indent-table
710			)
711
712    ;;  Make sure the property list is in use.
713    ;;  The interactive flag is only set after loading the file.
714    (or interactive
715	(property-list (syntax-quark *syntax*))
716    )
717
718#+debug
719    (setq *from* 0 *to* 0)
720
721#-debug
722    (and (>= *from* *to*) (return-from syntax-highlight (values *from* nil)))
723
724    ;;  Remove any existing properties from the text.
725    (clear-entities *from* (1+ *to*))
726
727    (setq stream
728#-debug	(make-string-input-stream (read-text *from* (- *to* *from*)))
729#+debug	*standard-input*
730    )
731
732    (prog*
733	(
734	;;  Used to check if end of file found but syntax stack did
735	;; not finish.
736	(point-max (point-max))
737
738	;;  Used in interactive mode, to return the syntax table
739	;; where the cursor is located.
740	(point (point))
741
742	;;  The current stack of states.
743	stack
744
745	;;  The current syntable.
746	(syntax-table (car (syntax-labels *syntax*)))
747
748	;;  The current syntable's default property.
749	(default-property (syntable-property syntax-table))
750
751	;;  Add this property to newlines as a hint to the interactive
752	;; callback, so that it knows from where to restart parsing.
753	newline-property
754
755	;;  The tokens in the current syntax table that may match,
756	;; i.e. the items in this list are not in nomatch.
757	token-list
758
759	;;  A pointer to the syntable token list, if token-list is
760	;; eq to this value, cannot change it inplace.
761	current-token-list
762
763	;;  Help to avoid allocating too many new object cells, and
764	;; optmizes a bit time in [n]?set-difference.
765	;;  This optimizes only the processing of one line of text
766	;; as nomatch must be rebuilt when reading a new line of text.
767	token-list-stack
768
769	;;  Matches for the current list of tokens.
770	matches
771
772	;;  Line of text.
773	line
774
775	;;  Length of the text line.
776	length
777
778	;;  A inverse cache, don't call re-exec when the regex is
779	;; already known to not match.
780	nomatch
781
782	;;  Use cache as a list of matches to avoid repetitive
783	;; unnecessary calls to re-exec.
784	;;  cache is a list in which every element has the format:
785	;;	(token . (start . end))
786	;;  Line of text.
787	cache
788
789	;;  Used just to avoid a function call at every re-exec call.
790	notbol
791
792	match
793
794	start
795	left
796	right
797	result
798	property
799
800	;;  Beginig a new syntax table?
801	begin
802
803	;;  Switching to another syntax table?
804	switch
805
806	;;  Property flag when changing the current syntax table.
807	contained
808
809	;;  Flag to know if syntax table has changed.
810	change
811
812	;;  Variables used when removing invalid elements from the
813	;; the cache.
814	item
815	from
816	to
817	)
818
819;-----------------------------------------------------------------------
820:read
821#+debug-verbose
822	(format t "** Entering :READ stack length is ~D~%" (length stack))
823#+debug	(format t "~%[~D]> " (incf *line-number*))
824
825	;;  If input has finished, return.
826	(unless (setq line (read-line stream nil nil))
827	    (when
828		(and
829		    ;; If a nested syntax table wasn't finished
830		    (consp stack)
831		    (<
832			(setq *to* (scan *from* :eol :right))
833			point-max
834		    )
835		)
836		(setq line (read-text *from* (- *to* *from*)))
837		(clear-entities *from* (1+ *to*))
838		(go :again)
839	    )
840#-debug	    (close stream)
841	    (return)
842	)
843
844;------------------------------------------------------------------------
845:again
846	(setq
847	    start		0
848	    length		(length line)
849	    token-list		(syntable-tokens syntax-table)
850	    current-token-list	token-list
851	    token-list-stack	()
852	    nomatch		()
853	    cache		()
854	)
855
856
857	;;  If empty line, and current table does not have matches for
858	;; the empty string at start or end of a text line.
859	(when
860	    (and
861		(= length 0)
862		(not (syntable-eol syntax-table))
863		(not (syntable-bol syntax-table)))
864#+debug-verbose
865	    (format t "Empty line and table has no match to bol or eol~%")
866
867	    (and newline-property
868		(add-entity *from* 1 (synprop-quark newline-property)))
869	    (go :update)
870	)
871
872;------------------------------------------------------------------------
873:loop
874#+debug-verbose
875	(format t "** Entering :LOOP at offset ~D in table ~A, cache has ~D items~%"
876	    start
877	    (syntable-label syntax-table)
878	    (length cache))
879
880	(setq notbol (> start 0))
881
882	;;  For every token that may match.
883	(dolist
884	    (token
885		(setq
886		    token-list
887		    (if (eq token-list current-token-list)
888			(set-difference token-list nomatch :test #'eq)
889			(nset-difference token-list nomatch :test #'eq)
890		    )
891		)
892	    )
893
894	    ;;	Try to fetch match from cache.
895	    (if (setq match (member token cache :test #'eq :key #'car))
896		;;  Match is in the cache.
897
898		(progn
899		    ;;	Match must be moved to the beginning of the
900		    ;; matches list, as a match from another syntax
901		    ;; table may be also in the cache, but before
902		    ;; the match for the current token.
903#+debug-verbose	    (format t "cached: {~A:~S} ~A~%"
904			(cdar match)
905			(subseq line (cadar match) (cddar match))
906			(syntoken-regex token))
907
908		    ;;	Remove the match from the cache.
909		    (if (eq match cache)
910
911			;;  This could be changed to only set "matches"
912			;; if it is not the first element of cache,
913			;; but is unsafe, because other tokens may
914			;; be added to "matches", and will end up
915			;; before when joining "matches" and "cache".
916			(progn
917			    (setq cache (cdr cache))
918			    (rplacd match matches)
919			    (setq matches match))
920
921			(progn
922			    (if (= (length match) 1)
923				(progn
924				    (rplacd (last cache 2) nil)
925				    (rplacd match matches)
926				    (setq matches match))
927				(progn
928				    (setq matches (cons (car match) matches))
929				    (rplaca match (cadr match))
930				    (rplacd match (cddr match)))
931			    )
932			)
933		    )
934
935		    ;;	Exit loop if the all the remaining
936		    ;; input was matched.
937		    (when
938			(and
939			    (= start (cadar match))
940			    (= length (cddar match))
941			)
942#+debug-verbose 	(format t "Rest of line match~%")
943			(return)
944		    )
945		)
946
947		;;  Not in the cache, call re-exec.
948		(if
949		    (consp
950			(setq
951			    match
952			    (re-exec
953				(syntoken-regex token)
954				line
955				:start	start
956				:notbol	notbol)))
957
958		    ;;	Match found.
959		    (progn
960#+debug-verbose		(format t "Adding to cache: {~A:~S} ~A~%"
961			    (car match)
962			    (subseq line (caar match) (cdar match))
963			    (syntoken-regex token))
964
965			;; Only the first pair is used.
966			(setq match (car match))
967
968			(cond
969			    (
970				(or
971				    (null matches)
972				    ;;	No overlap and after most
973				    ;; recent match.
974				    (>= (car match) (cddar matches))
975				    ;; No overlap and before most
976				    ;; recent match.
977				    (<= (cdr match) (cadar matches))
978				)
979				(setq
980				    matches
981				    (cons (cons token match) matches)
982				)
983			    )
984			    (
985				(or
986				    ;;	Overlap, but start before most
987				    ;; recent match.
988				    (< (car match) (cadar matches))
989				    (and
990					;;  Same offset as most recent
991					;; match, but is longer.
992					(= (car match) (cadar matches))
993					(> (cdr match) (cddar matches))
994				    )
995				)
996				(rplaca (car matches) token)
997				(rplacd (car matches) match)
998#+debug-verbose 		(format t "Replaced most recent match~%")
999			    )
1000			    (t
1001#+debug-verbose 		(format t "Ignored~%")
1002				;; XXX The interpreter does not yet implement
1003				;; implicit tagbody in dolist, just comment
1004				;; the go call in that case. (Will just do
1005				;; an unecessary test...)
1006				(go :ignored)
1007			    )
1008			)
1009
1010			;;  Exit loop if the all the remaining
1011			;; input was matched.
1012			(when
1013			    (and
1014				(= start (car match))
1015				(= length (cdr match)))
1016#+debug-verbose 	    (format t "Rest of line match~%")
1017			    (return))
1018		    )
1019
1020		    ;;	Match not found.
1021		    (progn
1022#+debug-verbose 	(format t "Adding to nomatch: ~A~%"
1023			    (syntoken-regex token))
1024			(setq nomatch (cons token nomatch)))
1025		)
1026	    )
1027:ignored
1028	)
1029
1030	;;  Add matches to the beginning of the cache list.
1031	(setq
1032	    ;;	Put matches with smaller offset first.
1033	    cache
1034	    (stable-sort (nconc (nreverse matches) cache) #'< :key #'cadr)
1035
1036	    ;;	Make sure that when the match loop is reentered, this
1037	    ;; variable is NIL.
1038	    matches
1039	    ()
1040	)
1041
1042	;;  While the first entry in the cache is not from the current table.
1043	(until (or (null cache) (member (caar cache) token-list :test #'eq))
1044
1045#+debug-verbose
1046	    (format t "Not in the current table, removing {~A:~S} ~A~%"
1047		(cdar cache)
1048		(subseq line (cadar cache) (cddar cache))
1049		(syntoken-regex (caar cache)))
1050
1051	    (setq cache (cdr cache))
1052	)
1053
1054
1055	;;  If nothing was matched in the entire/remaining line.
1056	(unless cache
1057	    (when default-property
1058		(if
1059		    (or
1060			(null result)
1061			(> start (cadar result))
1062			(not (eq (cddar result) default-property)))
1063		    (setq
1064			result
1065			(cons
1066			    (cons start (cons length default-property))
1067			    result
1068			)
1069		    )
1070		    (rplaca (cdar result) length)
1071		)
1072	    )
1073
1074#+debug-verbose
1075	    (format t "No match until end of line~%")
1076
1077	    ;;  Result already known, and there is no syntax table
1078	    ;; change, bypass :PARSE.
1079	    (and interactive
1080		(null indent-table)
1081		(<= 0 (- point *from*) length)
1082		(setq indent-table syntax-table))
1083	    (go :process)
1084	)
1085
1086#+debug-verbose
1087	(format t "Removing first candidate from cache {~A:~S} ~A~%"
1088	    (cdar cache)
1089	    (subseq line (cadar cache) (cddar cache))
1090	    (syntoken-regex (caar cache))
1091	)
1092
1093	;;  Prepare to choose best match.
1094	(setq
1095	    match	(car cache)
1096	    left	(cadr match)
1097	    right	(cddr match)
1098	    cache	(cdr cache)
1099	)
1100
1101	;;  First element can be safely removed now.
1102	;;  If there is only one, skip loop below.
1103	(or cache (go :parse))
1104
1105	;;  Remove elements of cache that must be discarded.
1106	(setq
1107	    item   (car cache)
1108	    from   (cadr item)
1109	    to     (cddr item)
1110	)
1111
1112	(loop
1113	    (if
1114		(or
1115
1116		    ;;	If everything removed from the cache.
1117		    (null item)
1118
1119		    ;;	Or next item is at a longer offset than the
1120		    ;; end of current match.
1121		    (>= from right)
1122		)
1123		(return)
1124	    )
1125
1126	    (and
1127		;;  If another match at the same offset.
1128		(= left from)
1129
1130		;;  And if this match is longer than the current one.
1131		(> to right)
1132
1133		(member (car item) token-list :test #'eq)
1134
1135		(setq
1136		    match   item
1137		    right   to
1138		)
1139	    )
1140
1141#+debug-verbose
1142	    (format t "Removing from cache {~A:~S} ~A~%"
1143		(cdar cache)
1144 		(subseq line from to)
1145		(syntoken-regex (caar cache)))
1146
1147	    (setq
1148		cache	    (cdr cache)
1149		item	    (car cache)
1150		from	    (cadr item)
1151		to	    (cddr item)
1152	    )
1153	)
1154
1155
1156;-----------------------------------------------------------------------
1157:parse
1158#+debug-verbose
1159	(format t "** Entering :PARSE~%")
1160
1161	(setq
1162
1163	    ;;  Change match value to the syntoken.
1164	    match	(car match)
1165
1166	    begin	(syntoken-begin match)
1167	    switch	(syntoken-switch match)
1168	    contained	(syntoken-contained match)
1169	    change	(or begin switch)
1170	)
1171
1172	;;  Check for unmatched leading text.
1173	(when (and default-property (> left start))
1174#+debug-verbose (format t "No match in {(~D . ~D):~S}~%"
1175		start
1176		left
1177		(subseq line start left)
1178	    )
1179	    (if
1180		(or
1181		    (null result)
1182		    (> start (cadar result))
1183		    (not (eq (cddar result) default-property)))
1184		(setq
1185		    result
1186		    (cons
1187			(cons start (cons left default-property))
1188			result
1189		    )
1190		)
1191		(rplaca (cdar result) left)
1192	    )
1193	)
1194
1195	;;  If the syntax table is not changed,
1196	;; or if the new table requires that the
1197	;; current default property be used.
1198	(unless (and change contained)
1199
1200	    (and
1201		(> right left)
1202		(setq
1203		    property
1204		    (or
1205			;;  If token specifies the property.
1206			(syntoken-property match)
1207			default-property
1208		    )
1209		)
1210
1211		;;  Add matched text.
1212		(if
1213		    (or
1214			(null result)
1215			(> left (cadar result))
1216			(not (eq (cddar result) property))
1217		    )
1218		    (setq
1219			result
1220			(cons
1221			    (cons left (cons right property))
1222			    result
1223			)
1224		    )
1225		    (rplaca (cdar result) right)
1226		)
1227	    )
1228
1229#+debug-verbose
1230	    (format t "(0)Match found for {(~D . ~D):~S}~%"
1231		left
1232		right
1233		(subseq line left right)
1234	    )
1235	)
1236
1237
1238	;;  Update start offset in the input now!
1239	(and interactive
1240	    (null indent-table)
1241	    (<= start (- point *from*) right)
1242	    (setq indent-table syntax-table))
1243	(setq start right)
1244
1245
1246	;;  When changing the current syntax table.
1247	(when change
1248	    (when switch
1249		(if (numberp switch)
1250
1251		    ;;	If returning to a previous state.
1252		    ;;	Don't generate an error if the stack
1253		    ;; becomes empty?
1254		    (while
1255			(< switch 0)
1256
1257			(setq
1258			    syntax-table	(pop stack)
1259			    token-list		(pop token-list-stack)
1260			    switch		(1+ switch)
1261			)
1262		    )
1263
1264		    ;;	Else, not to a previous state, but
1265		    ;; returning to a named syntax table,
1266		    ;; search for it in the stack.
1267		    (while
1268			(and
1269
1270			    (setq
1271				token-list	(pop token-list-stack)
1272				syntax-table	(pop stack)
1273			    )
1274
1275			    (not (eq switch syntax-table))
1276			)
1277			;;  Empty loop.
1278		    )
1279		)
1280
1281		;;  If no match found while popping
1282		;; the stack.
1283		(unless syntax-table
1284
1285		    ;;	Return to the topmost syntax table.
1286		    (setq
1287			syntax-table
1288			(car (syntax-labels *syntax*))
1289		    )
1290		)
1291
1292#+debug-verbose	(format t "switching to ~A offset: ~D~%"
1293		    (syntable-label syntax-table)
1294		    start
1295		)
1296
1297		(if (null token-list)
1298		    (setq token-list (syntable-tokens syntax-table))
1299		)
1300	    )
1301
1302	    (when begin
1303		;;  Save state for a possible
1304		;; :SWITCH later.
1305		(setq
1306		    stack	     (cons syntax-table stack)
1307		    token-list-stack (cons token-list token-list-stack)
1308		    token-list	     (syntable-tokens begin)
1309		    syntax-table     begin
1310		)
1311#+debug-verbose	(format t "begining ~A offset: ~D~%"
1312		    (syntable-label syntax-table)
1313		    start
1314		)
1315	    )
1316
1317	    ;;	Change current syntax table.
1318	    (setq
1319		default-property    (syntable-property syntax-table)
1320		current-token-list  (syntable-tokens syntax-table)
1321	    )
1322
1323	    ;;  Set newline property, to help interactive callback
1324	    ;;  Only need to have a defined value, for now don't care
1325	    ;; about wich value is being used, neither if there is
1326	    ;; a value to be set.
1327	    (if (null stack)
1328		(setq newline-property nil)
1329		(or newline-property
1330		    (setq newline-property default-property)
1331		    (setq newline-property (syntoken-property match))
1332		)
1333	    )
1334
1335	    ;; If processing of text was deferred.
1336	    (when contained
1337
1338		(and
1339		    (> right left)
1340		    (setq
1341			property
1342			(or
1343			    (syntoken-property match)
1344			    default-property
1345			)
1346		    )
1347		    ;;	Add matched text with the updated property.
1348		    (if
1349			(or
1350			    (null result)
1351			    (> left (cadar result))
1352			    (not (eq (cddar result) property))
1353			)
1354			(setq
1355			    result
1356			    (cons
1357				(cons left (cons right property))
1358				result
1359			    )
1360			)
1361			(rplaca (cdar result) right)
1362		    )
1363		)
1364
1365#+debug-verbose (format t "(1)Match found for {(~D . ~D):~S}~%"
1366		    left
1367		    right
1368		    (subseq line left right)
1369		)
1370	    )
1371
1372	    (go :loop)
1373	)
1374
1375
1376;-----------------------------------------------------------------------
1377	;;  Wait for the end of the line to process, so that
1378	;; it is possible to join sequential matches with the
1379	;; same text property.
1380	(and (or cache (< start length)) (go :loop))
1381:process
1382
1383#+debug-verbose
1384	(format t "** Entering :PROCESS~%")
1385
1386	(if result
1387	    (progn
1388		;;  If the last property was at the end of the line,
1389		;; there are nested syntax tables, and there is a
1390		;; default property, include the newline in the property,
1391		;; as a hint to the interactive callback.
1392		(and
1393		    newline-property
1394		    (if
1395			(and
1396			    (eq (cddar result) newline-property)
1397			    (= length (cadar result))
1398			)
1399			(rplaca (cdar result) (1+ length))
1400			(setq
1401			    result
1402			    (cons
1403				(cons length (cons (1+ length) newline-property))
1404				result
1405			    )
1406			)
1407		    )
1408		)
1409
1410		;;  Result was created in reversed order.
1411		(nreverse result)
1412		(dolist (item result)
1413		    (setq
1414			left		(car item)
1415			right		(cadr item)
1416			property	(cddr item))
1417
1418		    ;; Use the information.
1419		    (add-entity
1420			(+ *from* left)
1421			(- right left)
1422			(synprop-quark property))
1423		)
1424	    )
1425
1426	    (and newline-property
1427		(add-entity
1428		    (+ *from* length)
1429		    1
1430		    (synprop-quark newline-property))
1431	    )
1432	)
1433
1434;------------------------------------------------------------------------
1435:update
1436	;; Prepare for new matches.
1437	(setq
1438	    result	nil
1439
1440	    ;;	Update offset to read text.
1441	    ;;	Add 1 for the skipped newline.
1442	    *from*	(+ *from* length 1)
1443	)
1444
1445	(go :read)
1446    )
1447
1448#+debug (terpri)
1449    (values *to* indent-table)
1450)
1451
1452(compile 'syntax-highlight)
1453