aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Abrahamsen2022-12-05 21:59:03 -0800
committerEric Abrahamsen2024-03-30 15:19:47 -0700
commit8bee4060ea42c61e52ebe6487ff97bc095261050 (patch)
tree2d9d9e7ef775fdb885cb53ead590103f599a893e
parent0df8dadde2edaee406c76d639a22c70d0b03426b (diff)
downloademacs-8bee4060ea42c61e52ebe6487ff97bc095261050.tar.gz
emacs-8bee4060ea42c61e52ebe6487ff97bc095261050.zip
Add peg.el as a built-in library
* lisp/progmodes/peg.el: New file, taken from ELPA package. * test/lisp/peg-tests.el: Package tests. * doc/lispref/peg.texi: Documentation.
-rw-r--r--doc/lispref/Makefile.in1
-rw-r--r--doc/lispref/elisp.texi8
-rw-r--r--doc/lispref/peg.texi351
-rw-r--r--etc/NEWS8
-rw-r--r--lisp/progmodes/peg.el944
-rw-r--r--test/lisp/peg-tests.el367
6 files changed, 1679 insertions, 0 deletions
diff --git a/doc/lispref/Makefile.in b/doc/lispref/Makefile.in
index 0a228271be3..4ceffd7d7d3 100644
--- a/doc/lispref/Makefile.in
+++ b/doc/lispref/Makefile.in
@@ -112,6 +112,7 @@ srcs = \
112 $(srcdir)/os.texi \ 112 $(srcdir)/os.texi \
113 $(srcdir)/package.texi \ 113 $(srcdir)/package.texi \
114 $(srcdir)/parsing.texi \ 114 $(srcdir)/parsing.texi \
115 $(srcdir)/peg.texi \
115 $(srcdir)/positions.texi \ 116 $(srcdir)/positions.texi \
116 $(srcdir)/processes.texi \ 117 $(srcdir)/processes.texi \
117 $(srcdir)/records.texi \ 118 $(srcdir)/records.texi \
diff --git a/doc/lispref/elisp.texi b/doc/lispref/elisp.texi
index 71139db4359..ec93a0b9c8a 100644
--- a/doc/lispref/elisp.texi
+++ b/doc/lispref/elisp.texi
@@ -222,6 +222,7 @@ To view this manual in other formats, click
222* Non-ASCII Characters:: Non-ASCII text in buffers and strings. 222* Non-ASCII Characters:: Non-ASCII text in buffers and strings.
223* Searching and Matching:: Searching buffers for strings or regexps. 223* Searching and Matching:: Searching buffers for strings or regexps.
224* Syntax Tables:: The syntax table controls word and list parsing. 224* Syntax Tables:: The syntax table controls word and list parsing.
225* Parsing Expression Grammars:: Parsing structured buffer text.
225* Parsing Program Source:: Generate syntax tree for program sources. 226* Parsing Program Source:: Generate syntax tree for program sources.
226* Abbrevs:: How Abbrev mode works, and its data structures. 227* Abbrevs:: How Abbrev mode works, and its data structures.
227 228
@@ -1365,6 +1366,12 @@ Syntax Tables
1365* Syntax Table Internals:: How syntax table information is stored. 1366* Syntax Table Internals:: How syntax table information is stored.
1366* Categories:: Another way of classifying character syntax. 1367* Categories:: Another way of classifying character syntax.
1367 1368
1369Parsing Expression Grammars
1370
1371* PEX Definitions:: The syntax of PEX rules
1372* Parsing Actions:: Running actions upon successful parsing.
1373* Writing PEG Rules:: Tips for writing parsing rules.
1374
1368Parsing Program Source 1375Parsing Program Source
1369 1376
1370* Language Grammar:: Loading tree-sitter language grammar. 1377* Language Grammar:: Loading tree-sitter language grammar.
@@ -1720,6 +1727,7 @@ Object Internals
1720 1727
1721@include searching.texi 1728@include searching.texi
1722@include syntax.texi 1729@include syntax.texi
1730@include peg.texi
1723@include parsing.texi 1731@include parsing.texi
1724@include abbrevs.texi 1732@include abbrevs.texi
1725@include threads.texi 1733@include threads.texi
diff --git a/doc/lispref/peg.texi b/doc/lispref/peg.texi
new file mode 100644
index 00000000000..ef4dfa7653e
--- /dev/null
+++ b/doc/lispref/peg.texi
@@ -0,0 +1,351 @@
1@c -*-texinfo-*-
2@c This is part of the GNU Emacs Lisp Reference Manual.
3@c Copyright (C) 1990--1995, 1998--1999, 2001--2023 Free Software
4@c Foundation, Inc.
5@c See the file elisp.texi for copying conditions.
6@node Parsing Expression Grammars
7@chapter Parsing Expression Grammars
8@cindex text parsing
9@cindex parsing expression grammar
10
11 Emacs Lisp provides several tools for parsing and matching text,
12from regular expressions (@pxref{Regular Expressions}) to full
13@acronym{LL} grammar parsers (@pxref{Top,, Bovine parser
14development,bovine}). @dfn{Parsing Expression Grammars}
15(@acronym{PEG}) are another approach to text parsing that offer more
16structure and composibility than regular expressions, but less
17complexity than context-free grammars.
18
19A @acronym{PEG} parser is defined as a list of named rules, each of
20which matches text patterns, and/or contains references to other
21rules. Parsing is initiated with the function @code{peg-run} or the
22macro @code{peg-parse} (see below), and parses text after point in the
23current buffer, using a given set of rules.
24
25@cindex parsing expression
26The definition of each rule is referred to as a @dfn{parsing
27expression} (@acronym{PEX}), and can consist of a literal string, a
28regexp-like character range or set, a peg-specific construct
29resembling an elisp function call, a reference to another rule, or a
30combination of any of these. A grammar is expressed as a tree of
31rules in which one rule is typically treated as a ``root'' or
32``entry-point'' rule. For instance:
33
34@example
35@group
36((number sign digit (* digit))
37 (sign (or "+" "-" ""))
38 (digit [0-9]))
39@end group
40@end example
41
42Once defined, grammars can be used to parse text after point in the
43current buffer, in the following ways:
44
45@defmac peg-parse &rest pexs
46Match @var{pexs} at point. If @var{pexs} is a list of PEG rules, the
47first rule is considered the ``entry-point'':
48@end defmac
49
50@example
51@group
52(peg-parse
53 ((number sign digit (* digit))
54 (sign (or "+" "-" ""))
55 (digit [0-9])))
56@end group
57@end example
58
59This macro represents the simplest use of the @acronym{PEG} library,
60but also the least flexible, as the rules must be written directly
61into the source code. A more flexible approach involves use of three
62macros in conjunction: @code{with-peg-rules}, a @code{let}-like
63construct that makes a set of rules available within the macro body;
64@code{peg-run}, which initiates parsing given a single rule; and
65@code{peg}, which is used to wrap the entry-point rule name. In fact,
66a call to @code{peg-parse} expands to just this set of calls. The
67above example could be written as:
68
69@example
70@group
71(with-peg-rules
72 ((number sign digit (* digit))
73 (sign (or "+" "-" ""))
74 (digit [0-9]))
75 (peg-run (peg number)))
76@end group
77@end example
78
79This allows more explicit control over the ``entry-point'' of parsing,
80and allows the combination of rules from different sources.
81
82Individual rules can also be defined using a more @code{defun}-like
83syntax, using the macro @code{define-peg-rule}:
84
85@example
86(define-peg-rule digit ()
87 [0-9])
88@end example
89
90This also allows for rules that accept an argument (supplied by the
91@code{funcall} PEG rule).
92
93Another possibility is to define a named set of rules with
94@code{define-peg-ruleset}:
95
96@example
97(define-peg-ruleset number-grammar
98 '((number sign digit (* digit))
99 digit ;; A reference to the definition above.
100 (sign (or "+" "-" ""))))
101@end example
102
103Rules and rulesets defined this way can be referred to by name in
104later calls to @code{peg-run} or @code{with-peg-rules}:
105
106@example
107(with-peg-rules number-grammar
108 (peg-run (peg number)))
109@end example
110
111By default, calls to @code{peg-run} or @code{peg-parse} produce no
112output: parsing simply moves point. In order to return or otherwise
113act upon parsed strings, rules can include @dfn{actions}, see
114@ref{Parsing Actions}.
115
116@menu
117* PEX Definitions:: The syntax of PEX rules.
118* Parsing Actions:: Running actions upon successful parsing.
119* Writing PEG Rules:: Tips for writing parsing rules.
120@end menu
121
122@node PEX Definitions
123@section PEX Definitions
124
125Parsing expressions can be defined using the following syntax:
126
127@table @code
128@item (and E1 E2 ...)
129A sequence of @acronym{PEX}s that must all be matched. The @code{and} form is
130optional and implicit.
131
132@item (or E1 E2 ...)
133Prioritized choices, meaning that, as in Elisp, the choices are tried
134in order, and the first successful match is used. Note that this is
135distinct from context-free grammars, in which selection between
136multiple matches is indeterminate.
137
138@item (any)
139Matches any single character, as the regexp ``.''.
140
141@item @var{string}
142A literal string.
143
144@item (char @var{C})
145A single character @var{C}, as an Elisp character literal.
146
147@item (* @var{E})
148Zero or more instances of expression @var{E}, as the regexp @samp{*}.
149Matching is always ``greedy''.
150
151@item (+ @var{E})
152One or more instances of expression @var{E}, as the regexp @samp{+}.
153Matching is always ``greedy''.
154
155@item (opt @var{E})
156Zero or one instance of expression @var{E}, as the regexp @samp{?}.
157
158@item SYMBOL
159A symbol representing a previously-defined PEG rule.
160
161@item (range CH1 CH2)
162The character range between CH1 and CH2, as the regexp @samp{[CH1-CH2]}.
163
164@item [CH1-CH2 "+*" ?x]
165A character set, which can include ranges, character literals, or
166strings of characters.
167
168@item [ascii cntrl]
169A list of named character classes.
170
171@item (syntax-class @var{NAME})
172A single syntax class.
173
174@item (funcall E ARGS...)
175Call @acronym{PEX} E (previously defined with @code{define-peg-rule})
176with arguments @var{ARGS}.
177
178@item (null)
179The empty string.
180
181@end table
182
183The following expressions are used as anchors or tests -- they do not
184move point, but return a boolean value which can be used to constrain
185matches as a way of controlling the parsing process (@pxref{Writing
186PEG Rules}).
187
188@table @code
189@item (bob)
190Beginning of buffer.
191
192@item (eob)
193End of buffer.
194
195@item (bol)
196Beginning of line.
197
198@item (eol)
199End of line.
200
201@item (bow)
202Beginning of word.
203
204@item (eow)
205End of word.
206
207@item (bos)
208Beginning of symbol.
209
210@item (eos)
211End of symbol.
212
213@item (if E)
214Returns non-@code{nil} if parsing @acronym{PEX} E from point succeeds (point
215is not moved).
216
217@item (not E)
218Returns non-@code{nil} if parsing @acronym{PEX} E from point fails (point
219is not moved).
220
221@item (guard EXP)
222Treats the value of the Lisp expression EXP as a boolean.
223
224@end table
225
226@vindex peg-char-classes
227Character class matching can use the same named character classes as
228in regular expressions (@pxref{Top,, Character Classes,elisp})
229
230@node Parsing Actions
231@section Parsing Actions
232
233@cindex parsing actions
234@cindex parsing stack
235By default the process of parsing simply moves point in the current
236buffer, ultimately returning @code{t} if the parsing succeeds, and
237@code{nil} if it doesn't. It's also possible to define ``actions''
238that can run arbitrary Elisp at certain points in the parsed text.
239These actions can optionally affect something called the @dfn{parsing
240stack}, which is a list of values returned by the parsing process.
241These actions only run (and only return values) if the parsing process
242ultimately succeeds; if it fails the action code is not run at all.
243
244Actions can be added anywhere in the definition of a rule. They are
245distinguished from parsing expressions by an initial backquote
246(@samp{`}), followed by a parenthetical form that must contain a pair
247of hyphens (@samp{--}) somewhere within it. Symbols to the left of
248the hyphens are bound to values popped from the stack (they are
249somewhat analogous to the argument list of a lambda form). Values
250produced by code to the right are pushed to the stack (analogous to
251the return value of the lambda). For instance, the previous grammar
252can be augmented with actions to return the parsed number as an actual
253integer:
254
255@example
256(with-peg-rules ((number sign digit (* digit
257 `(a b -- (+ (* a 10) b)))
258 `(sign val -- (* sign val)))
259 (sign (or (and "+" `(-- 1))
260 (and "-" `(-- -1))
261 (and "" `(-- 1))))
262 (digit [0-9] `(-- (- (char-before) ?0))))
263 (peg-run (peg number)))
264@end example
265
266There must be values on the stack before they can be popped and
267returned -- if there aren't enough stack values to bind to an action's
268left-hand terms, they will be bound to @code{nil}. An action with
269only right-hand terms will push values to the stack; an action with
270only left-hand terms will consume (and discard) values from the stack.
271At the end of parsing, stack values are returned as a flat list.
272
273To return the string matched by a @acronym{PEX} (instead of simply
274moving point over it), a rule like this can be used:
275
276@example
277(one-word
278 `(-- (point))
279 (+ [word])
280 `(start -- (buffer-substring start (point))))
281@end example
282
283The first action pushes the initial value of point to the stack. The
284intervening @acronym{PEX} moves point over the next word. The second
285action pops the previous value from the stack (binding it to the
286variable @code{start}), and uses that value to extract a substring
287from the buffer and push it to the stack. This pattern is so common
288that @acronym{PEG} provides a shorthand function that does exactly the
289above, along with a few other shorthands for common scenarios:
290
291@table @code
292@item (substring @var{E})
293Match @acronym{PEX} @var{E} and push the matched string to the stack.
294
295@item (region @var{E})
296Match @var{E} and push the start and end positions of the matched
297region to the stack.
298
299@item (replace @var{E} @var{replacement})
300Match @var{E} and replaced the matched region with the string @var{replacement}.
301
302@item (list @var{E})
303Match @var{E}, collect all values produced by @var{E} (and its
304sub-expressions) into a list, and push that list to the stack. Stack
305values are typically returned as a flat list; this is a way of
306``grouping'' values together.
307@end table
308
309@node Writing PEG Rules
310@section Writing PEG Rules
311
312Something to be aware of when writing PEG rules is that they are
313greedy. Rules which can consume a variable amount of text will always
314consume the maximum amount possible, even if that causes a rule that
315might otherwise have matched to fail later on -- there is no
316backtracking. For instance, this rule will never succeed:
317
318@example
319(forest (+ "tree" (* [blank])) "tree" (eol))
320@end example
321
322The @acronym{PEX} @code{(+ "tree" (* [blank]))} will consume all
323repetitions of the word ``tree'', leaving none to match the final
324@code{"tree"}.
325
326In these situations, the desired result can be obtained by using
327predicates and guards -- namely the @code{not}, @code{if} and
328@code{guard} expressions -- to constrain behavior. For instance:
329
330@example
331(forest (+ "tree" (* [blank])) (not (eol)) "tree" (eol))
332@end example
333
334The @code{if} and @code{not} operators accept a parsing expression and
335interpret it as a boolean, without moving point. The contents of a
336@code{guard} operator are evaluated as regular Lisp (not a
337@acronym{PEX}) and should return a boolean value. A @code{nil} value
338causes the match to fail.
339
340Another potentially unexpected behavior is that parsing will move
341point as far as possible, even if the parsing ultimately fails. This
342rule:
343
344@example
345(end-game "game" (eob))
346@end example
347
348when run in a buffer containing the text ``game over'' after point,
349will move point to just after ``game'' then halt parsing, returning
350@code{nil}. Successful parsing will always return @code{t}, or the
351contexts of the parsing stack.
diff --git a/etc/NEWS b/etc/NEWS
index 8ccf04276f6..8e1c1082b3a 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -1585,6 +1585,14 @@ forwards-compatibility Compat package from GNU ELPA. This allows
1585built-in packages to use the library more effectively, and helps 1585built-in packages to use the library more effectively, and helps
1586preventing the installation of Compat if unnecessary. 1586preventing the installation of Compat if unnecessary.
1587 1587
1588+++
1589** New package PEG.
1590Emacs now includes a library for writing (P)arsing (E)xpression
1591(G)rammars, an approach to text parsing that provides more structure
1592than regular expressions, but less complexity than context-free
1593grammars. The Info manual "(elisp) Parsing Expression Grammars" has
1594documentation and examples.
1595
1588 1596
1589* Incompatible Lisp Changes in Emacs 30.1 1597* Incompatible Lisp Changes in Emacs 30.1
1590 1598
diff --git a/lisp/progmodes/peg.el b/lisp/progmodes/peg.el
new file mode 100644
index 00000000000..2eb4a7384d0
--- /dev/null
+++ b/lisp/progmodes/peg.el
@@ -0,0 +1,944 @@
1;;; peg.el --- Parsing Expression Grammars in Emacs Lisp -*- lexical-binding:t -*-
2
3;; Copyright (C) 2008-2023 Free Software Foundation, Inc.
4;;
5;; Author: Helmut Eller <eller.helmut@gmail.com>
6;; Maintainer: Stefan Monnier <monnier@iro.umontreal.ca>
7;; Version: 1.0.1
8;;
9;; This program is free software: you can redistribute it and/or modify
10;; it under the terms of the GNU General Public License as published by
11;; the Free Software Foundation, either version 3 of the License, or
12;; (at your option) any later version.
13;;
14;; This program is distributed in the hope that it will be useful,
15;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17;; GNU General Public License for more details.
18;;
19;; You should have received a copy of the GNU General Public License
20;; along with this program. If not, see <https://www.gnu.org/licenses/>.
21;;
22;;; Commentary:
23;;
24;; This package implements Parsing Expression Grammars for Emacs Lisp.
25
26;; Parsing Expression Grammars (PEG) are a formalism in the spirit of
27;; Context Free Grammars (CFG) with some simplifications which makes
28;; the implementation of PEGs as recursive descent parsers particularly
29;; simple and easy to understand [Ford, Baker].
30;; PEGs are more expressive than regexps and potentially easier to use.
31;;
32;; This file implements the macros `define-peg-rule', `with-peg-rules', and
33;; `peg-parse' which parses the current buffer according to a PEG.
34;; E.g. we can match integers with:
35;;
36;; (with-peg-rules
37;; ((number sign digit (* digit))
38;; (sign (or "+" "-" ""))
39;; (digit [0-9]))
40;; (peg-run (peg number)))
41;; or
42;; (define-peg-rule digit ()
43;; [0-9])
44;; (peg-parse (number sign digit (* digit))
45;; (sign (or "+" "-" "")))
46;;
47;; In contrast to regexps, PEGs allow us to define recursive "rules".
48;; A "grammar" is a set of rules. A rule is written as (NAME PEX...)
49;; E.g. (sign (or "+" "-" "")) is a rule with the name "sign".
50;; The syntax for PEX (Parsing Expression) is a follows:
51;;
52;; Description Lisp Traditional, as in Ford's paper
53;; =========== ==== ===========
54;; Sequence (and E1 E2) e1 e2
55;; Prioritized Choice (or E1 E2) e1 / e2
56;; Not-predicate (not E) !e
57;; And-predicate (if E) &e
58;; Any character (any) .
59;; Literal string "abc" "abc"
60;; Character C (char C) 'c'
61;; Zero-or-more (* E) e*
62;; One-or-more (+ E) e+
63;; Optional (opt E) e?
64;; Non-terminal SYMBOL A
65;; Character range (range A B) [a-b]
66;; Character set [a-b "+*" ?x] [a-b+*x] ;Note: it's a vector
67;; Character classes [ascii cntrl]
68;; Boolean-guard (guard EXP)
69;; Syntax-Class (syntax-class NAME)
70;; Local definitions (with RULES PEX...)
71;; Indirect call (funcall EXP ARGS...)
72;; and
73;; Empty-string (null) ε
74;; Beginning-of-Buffer (bob)
75;; End-of-Buffer (eob)
76;; Beginning-of-Line (bol)
77;; End-of-Line (eol)
78;; Beginning-of-Word (bow)
79;; End-of-Word (eow)
80;; Beginning-of-Symbol (bos)
81;; End-of-Symbol (eos)
82;;
83;; Rules can refer to other rules, and a grammar is often structured
84;; as a tree, with a root rule referring to one or more "branch
85;; rules", all the way down to the "leaf rules" that deal with actual
86;; buffer text. Rules can be recursive or mutually referential,
87;; though care must be taken not to create infinite loops.
88;;
89;;;; Named rulesets:
90;;
91;; You can define a set of rules for later use with:
92;;
93;; (define-peg-ruleset myrules
94;; (sign () (or "+" "-" ""))
95;; (digit () [0-9])
96;; (nat () digit (* digit))
97;; (int () sign digit (* digit))
98;; (float () int "." nat))
99;;
100;; and later refer to it:
101;;
102;; (with-peg-rules
103;; (myrules
104;; (complex float "+i" float))
105;; ... (peg-parse nat "," nat "," complex) ...)
106;;
107;;;; Parsing actions:
108;;
109;; PEXs also support parsing actions, i.e. Lisp snippets which are
110;; executed when a pex matches. This can be used to construct syntax
111;; trees or for similar tasks. The most basic form of action is
112;; written as:
113;;
114;; (action FORM) ; evaluate FORM for its side-effects
115;;
116;; Actions don't consume input, but are executed at the point of
117;; match. Another kind of action is called a "stack action", and
118;; looks like this:
119;;
120;; `(VAR... -- FORM...) ; stack action
121;;
122;; A stack action takes VARs from the "value stack" and pushes the
123;; results of evaluating FORMs to that stack.
124
125;; The value stack is created during the course of parsing. Certain
126;; operators (see below) that match buffer text can push values onto
127;; this stack. "Upstream" rules can then draw values from the stack,
128;; and optionally push new ones back. For instance, consider this
129;; very simple grammar:
130;;
131;; (with-peg-rules
132;; ((query (+ term) (eol))
133;; (term key ":" value (opt (+ [space]))
134;; `(k v -- (cons (intern k) v)))
135;; (key (substring (and (not ":") (+ [word]))))
136;; (value (or string-value number-value))
137;; (string-value (substring (+ [alpha])))
138;; (number-value (substring (+ [digit]))
139;; `(val -- (string-to-number val))))
140;; (peg-run (peg query)))
141;;
142;; This invocation of `peg-run' would parse this buffer text:
143;;
144;; name:Jane age:30
145;;
146;; And return this Elisp sexp:
147;;
148;; ((age . 30) (name . "Jane"))
149;;
150;; Note that, in complex grammars, some care must be taken to make
151;; sure that the number and type of values drawn from the stack always
152;; match those pushed. In the example above, both `string-value' and
153;; `number-value' push a single value to the stack. Since the `value'
154;; rule only includes these two sub-rules, any upstream rule that
155;; makes use of `value' can be confident it will always and only push
156;; a single value to the stack.
157;;
158;; Stack action forms are in a sense analogous to lambda forms: the
159;; symbols before the "--" are the equivalent of lambda arguments,
160;; while the forms after the "--" are return values. The difference
161;; being that a lambda form can only return a single value, while a
162;; stack action can push multiple values onto the stack. It's also
163;; perfectly valid to use `(-- FORM...)' or `(VAR... --)': the former
164;; pushes values to the stack without consuming any, and the latter
165;; pops values from the stack and discards them.
166;;
167;;;; Derived Operators:
168;;
169;; The following operators are implemented as combinations of
170;; primitive expressions:
171;;
172;; (substring E) ; Match E and push the substring for the matched region.
173;; (region E) ; Match E and push the start and end positions.
174;; (replace E RPL); Match E and replace the matched region with RPL.
175;; (list E) ; Match E and push a list of the items that E produced.
176;;
177;; See `peg-ex-parse-int' in `peg-tests.el' for further examples.
178;;
179;; Regexp equivalents:
180;;
181;; Here a some examples for regexps and how those could be written as pex.
182;; [Most are taken from rx.el]
183;;
184;; "^[a-z]*"
185;; (and (bol) (* [a-z]))
186;;
187;; "\n[^ \t]"
188;; (and "\n" (not [" \t"]) (any))
189;;
190;; "\\*\\*\\* EOOH \\*\\*\\*\n"
191;; "*** EOOH ***\n"
192;;
193;; "\\<\\(catch\\|finally\\)\\>[^_]"
194;; (and (bow) (or "catch" "finally") (eow) (not "_") (any))
195;;
196;; "[ \t\n]*:\\([^:]+\\|$\\)"
197;; (and (* [" \t\n"]) ":" (or (+ (not ":") (any)) (eol)))
198;;
199;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
200;; (and (bol)
201;; "content-transfer-encoding:"
202;; (* (opt "\n") ["\t "])
203;; "quoted-printable"
204;; (* (opt "\n") ["\t "]))
205;;
206;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
207;; (and "$Id: " (+ (not " ") (any)) " " (+ (not " ") (any)) " ")
208;;
209;; "^;;\\s-*\n\\|^\n"
210;; (or (and (bol) ";;" (* (syntax-class whitespace)) "\n")
211;; (and (bol) "\n"))
212;;
213;; "\\\\\\\\\\[\\w+"
214;; (and "\\\\[" (+ (syntax-class word)))
215;;
216;; See ";;; Examples" in `peg-tests.el' for other examples.
217;;
218;;;; Rule argument and indirect calls:
219;;
220;; Rules can take arguments and those arguments can themselves be PEGs.
221;; For example:
222;;
223;; (define-peg-rule 2-or-more (peg)
224;; (funcall peg)
225;; (funcall peg)
226;; (* (funcall peg)))
227;;
228;; ... (peg-parse
229;; ...
230;; (2-or-more (peg foo))
231;; ...
232;; (2-or-more (peg bar))
233;; ...)
234;;
235;;;; References:
236;;
237;; [Ford] Bryan Ford. Parsing Expression Grammars: a Recognition-Based
238;; Syntactic Foundation. In POPL'04: Proceedings of the 31st ACM
239;; SIGPLAN-SIGACT symposium on Principles of Programming Languages,
240;; pages 111-122, New York, NY, USA, 2004. ACM Press.
241;; http://pdos.csail.mit.edu/~baford/packrat/
242;;
243;; [Baker] Baker, Henry G. "Pragmatic Parsing in Common Lisp". ACM Lisp
244;; Pointers 4(2), April--June 1991, pp. 3--15.
245;; http://home.pipeline.com/~hbaker1/Prag-Parse.html
246;;
247;; Roman Redziejowski does good PEG related research
248;; http://www.romanredz.se/pubs.htm
249
250;;;; Todo:
251
252;; - Fix the exponential blowup in `peg-translate-exp'.
253;; - Add a proper debug-spec for PEXs.
254
255;;; News:
256
257;; Since 1.0.1:
258;; - Use OClosures to represent PEG rules when available, and let cl-print
259;; display their source code.
260;; - New PEX form (with RULES PEX...).
261;; - Named rulesets.
262;; - You can pass arguments to rules.
263;; - New `funcall' rule to call rules indirectly (e.g. a peg you received
264;; as argument).
265
266;; Version 1.0:
267;; - New official entry points `peg` and `peg-run`.
268
269;;; Code:
270
271(eval-when-compile (require 'cl-lib))
272
273(defvar peg--actions nil
274 "Actions collected along the current parse.
275Used at runtime for backtracking. It's a list ((POS . THUNK)...).
276Each THUNK is executed at the corresponding POS. Thunks are
277executed in a postprocessing step, not during parsing.")
278
279(defvar peg--errors nil
280 "Data keeping track of the rightmost parse failure location.
281It's a pair (POSITION . EXPS ...). POSITION is the buffer position and
282EXPS is a list of rules/expressions that failed.")
283
284;;;; Main entry points
285
286(defmacro peg--when-fboundp (f &rest body)
287 (declare (indent 1) (debug (sexp body)))
288 (when (fboundp f)
289 (macroexp-progn body)))
290
291(peg--when-fboundp oclosure-define
292 (oclosure-define peg-function
293 "Parsing function built from PEG rule."
294 pexs)
295
296 (cl-defmethod cl-print-object ((peg peg-function) stream)
297 (princ "#f<peg " stream)
298 (let ((args (help-function-arglist peg 'preserve-names)))
299 (if args
300 (prin1 args stream)
301 (princ "()" stream)))
302 (princ " " stream)
303 (prin1 (peg-function--pexs peg) stream)
304 (princ ">" stream)))
305
306(defmacro peg--lambda (pexs args &rest body)
307 (declare (indent 2)
308 (debug (&define form lambda-list def-body)))
309 (if (fboundp 'oclosure-lambda)
310 `(oclosure-lambda (peg-function (pexs ,pexs)) ,args . ,body)
311 `(lambda ,args . ,body)))
312
313;; Sometimes (with-peg-rules ... (peg-run (peg ...))) is too
314;; longwinded for the task at hand, so `peg-parse' comes in handy.
315(defmacro peg-parse (&rest pexs)
316 "Match PEXS at point.
317PEXS is a sequence of PEG expressions, implicitly combined with `and'.
318Returns STACK if the match succeed and signals an error on failure,
319moving point along the way.
320PEXS can also be a list of PEG rules, in which case the first rule is used."
321 (if (and (consp (car pexs))
322 (symbolp (caar pexs))
323 (not (ignore-errors (peg-normalize (car pexs)))))
324 ;; `pexs' is a list of rules: use the first rule as entry point.
325 `(with-peg-rules ,pexs (peg-run (peg ,(caar pexs)) #'peg-signal-failure))
326 `(peg-run (peg ,@pexs) #'peg-signal-failure)))
327
328(defmacro peg (&rest pexs)
329 "Return a PEG-matcher that matches PEXS."
330 (pcase (peg-normalize `(and . ,pexs))
331 (`(call ,name) `#',(peg--rule-id name)) ;Optimize this case by η-reduction!
332 (exp `(peg--lambda ',pexs () ,(peg-translate-exp exp)))))
333
334;; There are several "infos we want to return" when parsing a given PEX:
335;; 1- We want to return the success/failure of the parse.
336;; 2- We want to return the data of the successful parse (the stack).
337;; 3- We want to return the diagnostic of the failures.
338;; 4- We want to perform the actions (upon parse success)!
339;; `peg-parse' used an error signal to encode the (1) boolean, which
340;; lets it return all the info conveniently but the error signal was sometimes
341;; inconvenient. Other times one wants to just know (1) maybe without even
342;; performing (4).
343;; `peg-run' lets you choose all that, and by default gives you
344;; (1) as a simple boolean, while also doing (2), and (4).
345
346(defun peg-run (peg-matcher &optional failure-function success-function)
347 "Parse with PEG-MATCHER at point and run the success/failure function.
348If a match was found, move to the end of the match and call SUCCESS-FUNCTION
349with one argument: a function which will perform all the actions collected
350during the parse and then return the resulting stack (or t if empty).
351If no match was found, move to the (rightmost) point of parse failure and call
352FAILURE-FUNCTION with one argument, which is a list of PEG expressions that
353failed at this point.
354SUCCESS-FUNCTION defaults to `funcall' and FAILURE-FUNCTION
355defaults to `ignore'."
356 (let ((peg--actions '()) (peg--errors '(-1)))
357 (if (funcall peg-matcher)
358 ;; Found a parse: run the actions collected along the way.
359 (funcall (or success-function #'funcall)
360 (lambda ()
361 (save-excursion (peg-postprocess peg--actions))))
362 (goto-char (car peg--errors))
363 (when failure-function
364 (funcall failure-function (peg-merge-errors (cdr peg--errors)))))))
365
366(defmacro define-peg-rule (name args &rest pexs)
367 "Define PEG rule NAME as equivalent to PEXS.
368The PEG expressions in PEXS are implicitly combined with the
369sequencing `and' operator of PEG grammars."
370 (declare (indent 1))
371 (let ((inline nil))
372 (while (keywordp (car pexs))
373 (pcase (pop pexs)
374 (:inline (setq inline (car pexs))))
375 (setq pexs (cdr pexs)))
376 (let ((id (peg--rule-id name))
377 (exp (peg-normalize `(and . ,pexs))))
378 `(progn
379 (defalias ',id
380 (peg--lambda ',pexs ,args
381 ,(if inline
382 ;; Short-circuit to peg--translate in order to skip
383 ;; the extra failure-recording of `peg-translate-exp'.
384 ;; It also skips the cycle detection of
385 ;; `peg--translate-rule-body', which is not the main
386 ;; purpose but we can live with it.
387 (apply #'peg--translate exp)
388 (peg--translate-rule-body name exp))))
389 (eval-and-compile
390 ;; FIXME: We shouldn't need this any more since the info is now
391 ;; stored in the function, but sadly we need to find a name's EXP
392 ;; during compilation (i.e. before the `defalias' is executed)
393 ;; as part of cycle-detection!
394 (put ',id 'peg--rule-definition ',exp)
395 ,@(when inline
396 ;; FIXME: Copied from `defsubst'.
397 `(;; Never native-compile defsubsts as we need the byte
398 ;; definition in `byte-compile-unfold-bcf' to perform the
399 ;; inlining (Bug#42664, Bug#43280, Bug#44209).
400 ,(byte-run--set-speed id nil -1)
401 (put ',id 'byte-optimizer #'byte-compile-inline-expand))))))))
402
403(defmacro define-peg-ruleset (name &rest rules)
404 "Define a set of PEG rules for later use, e.g., in `with-peg-rules'."
405 (declare (indent 1))
406 (let ((defs ())
407 (aliases ()))
408 (dolist (rule rules)
409 (let* ((rname (car rule))
410 (full-rname (format "%s %s" name rname)))
411 (push `(define-peg-rule ,full-rname . ,(cdr rule)) defs)
412 (push `(,(peg--rule-id rname) #',(peg--rule-id full-rname)) aliases)))
413 `(cl-flet ,aliases
414 ,@defs
415 (eval-and-compile (put ',name 'peg--rules ',aliases)))))
416
417(defmacro with-peg-rules (rules &rest body)
418 "Make PEG rules RULES available within the scope of BODY.
419RULES is a list of rules of the form (NAME . PEXS), where PEXS is a sequence
420of PEG expressions, implicitly combined with `and'.
421RULES can also contain symbols in which case these must name
422rulesets defined previously with `define-peg-ruleset'."
423 (declare (indent 1) (debug (sexp form))) ;FIXME: `sexp' is not good enough!
424 (let* ((rulesets nil)
425 (rules
426 ;; First, macroexpand the rules.
427 (delq nil
428 (mapcar (lambda (rule)
429 (if (symbolp rule)
430 (progn (push rule rulesets) nil)
431 (cons (car rule) (peg-normalize `(and . ,(cdr rule))))))
432 rules)))
433 (ctx (assq :peg-rules macroexpand-all-environment)))
434 (macroexpand-all
435 `(cl-labels
436 ,(mapcar (lambda (rule)
437 ;; FIXME: Use `peg--lambda' as well.
438 `(,(peg--rule-id (car rule))
439 ()
440 ,(peg--translate-rule-body (car rule) (cdr rule))))
441 rules)
442 ,@body)
443 `((:peg-rules ,@(append rules (cdr ctx)))
444 ,@macroexpand-all-environment))))
445
446;;;;; Old entry points
447
448(defmacro peg-parse-exp (exp)
449 "Match the parsing expression EXP at point."
450 (declare (obsolete peg-parse "peg-0.9"))
451 `(peg-run (peg ,exp)))
452
453;;;; The actual implementation
454
455(defun peg--lookup-rule (name)
456 (or (cdr (assq name (cdr (assq :peg-rules macroexpand-all-environment))))
457 ;; With `peg-function' objects, we can recover the PEG from which it was
458 ;; defined, but this info is not yet available at compile-time. :-(
459 ;;(let ((id (peg--rule-id name)))
460 ;; (peg-function--pexs (symbol-function id)))
461 (get (peg--rule-id name) 'peg--rule-definition)))
462
463(defun peg--rule-id (name)
464 (intern (format "peg-rule %s" name)))
465
466(define-error 'peg-search-failed "Parse error at %d (expecting %S)")
467
468(defun peg-signal-failure (failures)
469 (signal 'peg-search-failed (list (point) failures)))
470
471(defun peg-parse-at-point (peg-matcher)
472 "Parse text at point according to the PEG rule PEG-MATCHER."
473 (declare (obsolete peg-run "peg-1.0"))
474 (peg-run peg-matcher
475 #'peg-signal-failure
476 (lambda (f) (let ((r (funcall f))) (if (listp r) r)))))
477
478;; Internally we use a regularized syntax, e.g. we only have binary OR
479;; nodes. Regularized nodes are lists of the form (OP ARGS...).
480(cl-defgeneric peg-normalize (exp)
481 "Return a \"normalized\" form of EXP."
482 (error "Invalid parsing expression: %S" exp))
483
484(cl-defmethod peg-normalize ((exp string))
485 (let ((len (length exp)))
486 (cond ((zerop len) '(guard t))
487 ((= len 1) `(char ,(aref exp 0)))
488 (t `(str ,exp)))))
489
490(cl-defmethod peg-normalize ((exp symbol))
491 ;; (peg--lookup-rule exp)
492 `(call ,exp))
493
494(cl-defmethod peg-normalize ((exp vector))
495 (peg-normalize `(set . ,(append exp '()))))
496
497(cl-defmethod peg-normalize ((exp cons))
498 (apply #'peg--macroexpand exp))
499
500(defconst peg-leaf-types '(any call action char range str set
501 guard syntax-class = funcall))
502
503(cl-defgeneric peg--macroexpand (head &rest args)
504 (cond
505 ((memq head peg-leaf-types) (cons head args))
506 (t `(call ,head ,@args))))
507
508(cl-defmethod peg--macroexpand ((_ (eql or)) &rest args)
509 (cond ((null args) '(guard nil))
510 ((null (cdr args)) (peg-normalize (car args)))
511 (t `(or ,(peg-normalize (car args))
512 ,(peg-normalize `(or . ,(cdr args)))))))
513
514(cl-defmethod peg--macroexpand ((_ (eql and)) &rest args)
515 (cond ((null args) '(guard t))
516 ((null (cdr args)) (peg-normalize (car args)))
517 (t `(and ,(peg-normalize (car args))
518 ,(peg-normalize `(and . ,(cdr args)))))))
519
520(cl-defmethod peg--macroexpand ((_ (eql *)) &rest args)
521 `(* ,(peg-normalize `(and . ,args))))
522
523;; FIXME: this duplicates code; could use some loop to avoid that
524(cl-defmethod peg--macroexpand ((_ (eql +)) &rest args)
525 (let ((e (peg-normalize `(and . ,args))))
526 `(and ,e (* ,e))))
527
528(cl-defmethod peg--macroexpand ((_ (eql opt)) &rest args)
529 (let ((e (peg-normalize `(and . ,args))))
530 `(or ,e (guard t))))
531
532(cl-defmethod peg--macroexpand ((_ (eql if)) &rest args)
533 `(if ,(peg-normalize `(and . ,args))))
534
535(cl-defmethod peg--macroexpand ((_ (eql not)) &rest args)
536 `(not ,(peg-normalize `(and . ,args))))
537
538(cl-defmethod peg--macroexpand ((_ (eql \`)) form)
539 (peg-normalize `(stack-action ,form)))
540
541(cl-defmethod peg--macroexpand ((_ (eql stack-action)) form)
542 (unless (member '-- form)
543 (error "Malformed stack action: %S" form))
544 (let ((args (cdr (member '-- (reverse form))))
545 (values (cdr (member '-- form))))
546 (let ((form `(let ,(mapcar (lambda (var) `(,var (pop peg--stack))) args)
547 ,@(mapcar (lambda (val) `(push ,val peg--stack)) values))))
548 `(action ,form))))
549
550(defvar peg-char-classes
551 '(ascii alnum alpha blank cntrl digit graph lower multibyte nonascii print
552 punct space unibyte upper word xdigit))
553
554(cl-defmethod peg--macroexpand ((_ (eql set)) &rest specs)
555 (cond ((null specs) '(guard nil))
556 ((and (null (cdr specs))
557 (let ((range (peg-range-designator (car specs))))
558 (and range `(range ,(car range) ,(cdr range))))))
559 (t
560 (let ((chars '()) (ranges '()) (classes '()))
561 (while specs
562 (let* ((spec (pop specs))
563 (range (peg-range-designator spec)))
564 (cond (range
565 (push range ranges))
566 ((peg-characterp spec)
567 (push spec chars))
568 ((stringp spec)
569 (setq chars (append (reverse (append spec ())) chars)))
570 ((memq spec peg-char-classes)
571 (push spec classes))
572 (t (error "Invalid set specifier: %S" spec)))))
573 (setq ranges (reverse ranges))
574 (setq chars (delete-dups (reverse chars)))
575 (setq classes (reverse classes))
576 (cond ((and (null ranges)
577 (null classes)
578 (cond ((null chars) '(guard nil))
579 ((null (cdr chars)) `(char ,(car chars))))))
580 (t `(set ,ranges ,chars ,classes)))))))
581
582(defun peg-range-designator (x)
583 (and (symbolp x)
584 (let ((str (symbol-name x)))
585 (and (= (length str) 3)
586 (eq (aref str 1) ?-)
587 (< (aref str 0) (aref str 2))
588 (cons (aref str 0) (aref str 2))))))
589
590;; characterp is new in Emacs 23.
591(defun peg-characterp (x)
592 (if (fboundp 'characterp)
593 (characterp x)
594 (integerp x)))
595
596(cl-defmethod peg--macroexpand ((_ (eql list)) &rest args)
597 (peg-normalize
598 (let ((marker (make-symbol "magic-marker")))
599 `(and (stack-action (-- ',marker))
600 ,@args
601 (stack-action (--
602 (let ((l '()))
603 (while
604 (let ((e (pop peg--stack)))
605 (cond ((eq e ',marker) nil)
606 ((null peg--stack)
607 (error "No marker on stack"))
608 (t (push e l) t))))
609 l)))))))
610
611(cl-defmethod peg--macroexpand ((_ (eql substring)) &rest args)
612 (peg-normalize
613 `(and `(-- (point))
614 ,@args
615 `(start -- (buffer-substring-no-properties start (point))))))
616
617(cl-defmethod peg--macroexpand ((_ (eql region)) &rest args)
618 (peg-normalize
619 `(and `(-- (point))
620 ,@args
621 `(-- (point)))))
622
623(cl-defmethod peg--macroexpand ((_ (eql replace)) pe replacement)
624 (peg-normalize
625 `(and (stack-action (-- (point)))
626 ,pe
627 (stack-action (start -- (progn
628 (delete-region start (point))
629 (insert-before-markers ,replacement))))
630 (stack-action (_ --)))))
631
632(cl-defmethod peg--macroexpand ((_ (eql quote)) _form)
633 (error "quote is reserved for future use"))
634
635(cl-defgeneric peg--translate (head &rest args)
636 (error "No translator for: %S" (cons head args)))
637
638(defun peg--translate-rule-body (name exp)
639 (let ((msg (condition-case err
640 (progn (peg-detect-cycles exp (list name)) nil)
641 (error (error-message-string err))))
642 (code (peg-translate-exp exp)))
643 (cond
644 ((null msg) code)
645 ((fboundp 'macroexp--warn-and-return)
646 (macroexp--warn-and-return msg code))
647 (t
648 (message "%s" msg)
649 code))))
650
651;; This is the main translation function.
652(defun peg-translate-exp (exp)
653 "Return the ELisp code to match the PE EXP."
654 ;; FIXME: This expansion basically duplicates `exp' in the output, which is
655 ;; a serious problem because it's done recursively, so it makes the output
656 ;; code's size exponentially larger than the input!
657 `(or ,(apply #'peg--translate exp)
658 (peg--record-failure ',exp))) ; for error reporting
659
660(define-obsolete-function-alias 'peg-record-failure
661 #'peg--record-failure "peg-1.0")
662(defun peg--record-failure (exp)
663 (cond ((= (point) (car peg--errors))
664 (setcdr peg--errors (cons exp (cdr peg--errors))))
665 ((> (point) (car peg--errors))
666 (setq peg--errors (list (point) exp))))
667 nil)
668
669(cl-defmethod peg--translate ((_ (eql and)) e1 e2)
670 `(and ,(peg-translate-exp e1)
671 ,(peg-translate-exp e2)))
672
673;; Choicepoints are used for backtracking. At a choicepoint we save
674;; enough state, so that we can continue from there if needed.
675(defun peg--choicepoint-moved-p (choicepoint)
676 `(/= ,(car choicepoint) (point)))
677
678(defun peg--choicepoint-restore (choicepoint)
679 `(progn
680 (goto-char ,(car choicepoint))
681 (setq peg--actions ,(cdr choicepoint))))
682
683(defmacro peg--with-choicepoint (var &rest body)
684 (declare (indent 1) (debug (symbolp form)))
685 `(let ((,var (cons (make-symbol "point") (make-symbol "actions"))))
686 `(let ((,(car ,var) (point))
687 (,(cdr ,var) peg--actions))
688 ,@(list ,@body))))
689
690(cl-defmethod peg--translate ((_ (eql or)) e1 e2)
691 (peg--with-choicepoint cp
692 `(or ,(peg-translate-exp e1)
693 (,@(peg--choicepoint-restore cp)
694 ,(peg-translate-exp e2)))))
695
696(cl-defmethod peg--translate ((_ (eql with)) rules &rest exps)
697 `(with-peg-rules ,rules ,(peg--translate `(and . ,exps))))
698
699(cl-defmethod peg--translate ((_ (eql guard)) exp) exp)
700
701(defvar peg-syntax-classes
702 '((whitespace ?-) (word ?w) (symbol ?s) (punctuation ?.)
703 (open ?\() (close ?\)) (string ?\") (escape ?\\) (charquote ?/)
704 (math ?$) (prefix ?') (comment ?<) (endcomment ?>)
705 (comment-fence ?!) (string-fence ?|)))
706
707(cl-defmethod peg--translate ((_ (eql syntax-class)) class)
708 (let ((probe (assoc class peg-syntax-classes)))
709 (cond (probe `(when (looking-at ,(format "\\s%c" (cadr probe)))
710 (forward-char)
711 t))
712 (t (error "Invalid syntax class: %S\nMust be one of: %s" class
713 (mapcar #'car peg-syntax-classes))))))
714
715(cl-defmethod peg--translate ((_ (eql =)) string)
716 `(let ((str ,string))
717 (when (zerop (length str))
718 (error "Empty strings not allowed for ="))
719 (search-forward str (+ (point) (length str)) t)))
720
721(cl-defmethod peg--translate ((_ (eql *)) e)
722 `(progn (while ,(peg--with-choicepoint cp
723 `(if ,(peg-translate-exp e)
724 ;; Just as regexps do for the `*' operator,
725 ;; we allow the body of `*' loops to match
726 ;; the empty string, but we don't repeat the loop if
727 ;; we haven't moved, to avoid inf-loops.
728 ,(peg--choicepoint-moved-p cp)
729 ,(peg--choicepoint-restore cp)
730 nil)))
731 t))
732
733(cl-defmethod peg--translate ((_ (eql if)) e)
734 (peg--with-choicepoint cp
735 `(when ,(peg-translate-exp e)
736 ,(peg--choicepoint-restore cp)
737 t)))
738
739(cl-defmethod peg--translate ((_ (eql not)) e)
740 (peg--with-choicepoint cp
741 `(unless ,(peg-translate-exp e)
742 ,(peg--choicepoint-restore cp)
743 t)))
744
745(cl-defmethod peg--translate ((_ (eql any)) )
746 '(when (not (eobp))
747 (forward-char)
748 t))
749
750(cl-defmethod peg--translate ((_ (eql char)) c)
751 `(when (eq (char-after) ',c)
752 (forward-char)
753 t))
754
755(cl-defmethod peg--translate ((_ (eql set)) ranges chars classes)
756 `(when (looking-at ',(peg-make-charset-regexp ranges chars classes))
757 (forward-char)
758 t))
759
760(defun peg-make-charset-regexp (ranges chars classes)
761 (when (and (not ranges) (not classes) (<= (length chars) 1))
762 (error "Bug"))
763 (let ((rbracket (member ?\] chars))
764 (minus (member ?- chars))
765 (hat (member ?^ chars)))
766 (dolist (c '(?\] ?- ?^))
767 (setq chars (remove c chars)))
768 (format "[%s%s%s%s%s%s]"
769 (if rbracket "]" "")
770 (if minus "-" "")
771 (mapconcat (lambda (x) (format "%c-%c" (car x) (cdr x))) ranges "")
772 (mapconcat (lambda (c) (format "[:%s:]" c)) classes "")
773 (mapconcat (lambda (c) (format "%c" c)) chars "")
774 (if hat "^" ""))))
775
776(cl-defmethod peg--translate ((_ (eql range)) from to)
777 `(when (and (char-after)
778 (<= ',from (char-after))
779 (<= (char-after) ',to))
780 (forward-char)
781 t))
782
783(cl-defmethod peg--translate ((_ (eql str)) str)
784 `(when (looking-at ',(regexp-quote str))
785 (goto-char (match-end 0))
786 t))
787
788(cl-defmethod peg--translate ((_ (eql call)) name &rest args)
789 `(,(peg--rule-id name) ,@args))
790
791(cl-defmethod peg--translate ((_ (eql funcall)) exp &rest args)
792 `(funcall ,exp ,@args))
793
794(cl-defmethod peg--translate ((_ (eql action)) form)
795 `(progn
796 (push (cons (point) (lambda () ,form)) peg--actions)
797 t))
798
799(defvar peg--stack nil)
800(defun peg-postprocess (actions)
801 "Execute \"actions\"."
802 (let ((peg--stack '())
803 (forw-actions ()))
804 (pcase-dolist (`(,pos . ,thunk) actions)
805 (push (cons (copy-marker pos) thunk) forw-actions))
806 (pcase-dolist (`(,pos . ,thunk) forw-actions)
807 (goto-char pos)
808 (funcall thunk))
809 (or peg--stack t)))
810
811;; Left recursion is presumably a common mistake when using PEGs.
812;; Here we try to detect such mistakes. Essentially we traverse the
813;; graph as long as we can without consuming input. When we find a
814;; recursive call we signal an error.
815
816(defun peg-detect-cycles (exp path)
817 "Signal an error on a cycle.
818Otherwise traverse EXP recursively and return T if EXP can match
819without consuming input. Return nil if EXP definitely consumes
820input. PATH is the list of rules that we have visited so far."
821 (apply #'peg--detect-cycles path exp))
822
823(cl-defgeneric peg--detect-cycles (head _path &rest args)
824 (error "No detect-cycle method for: %S" (cons head args)))
825
826(cl-defmethod peg--detect-cycles (path (_ (eql call)) name)
827 (if (member name path)
828 (error "Possible left recursion: %s"
829 (mapconcat (lambda (x) (format "%s" x))
830 (reverse (cons name path)) " -> "))
831 (let ((exp (peg--lookup-rule name)))
832 (if (null exp)
833 ;; If there's no rule by that name, either we'll fail at
834 ;; run-time or it will be defined later. In any case, at this
835 ;; point there's no evidence of a cycle, and if a cycle appears
836 ;; later we'll hopefully catch it when the rule gets defined.
837 ;; FIXME: In practice, if `name' is part of the cycle, we will
838 ;; indeed detect it when it gets defined, but OTOH if `name'
839 ;; is not part of a cycle but it *enables* a cycle because
840 ;; it matches the empty string (i.e. we should have returned t
841 ;; here), then we may not catch the problem at all :-(
842 nil
843 (peg-detect-cycles exp (cons name path))))))
844
845(cl-defmethod peg--detect-cycles (path (_ (eql and)) e1 e2)
846 (and (peg-detect-cycles e1 path)
847 (peg-detect-cycles e2 path)))
848
849(cl-defmethod peg--detect-cycles (path (_ (eql or)) e1 e2)
850 (or (peg-detect-cycles e1 path)
851 (peg-detect-cycles e2 path)))
852
853(cl-defmethod peg--detect-cycles (path (_ (eql *)) e)
854 (peg-detect-cycles e path)
855 t)
856
857(cl-defmethod peg--detect-cycles (path (_ (eql if)) e)
858 (peg-unary-nullable e path))
859(cl-defmethod peg--detect-cycles (path (_ (eql not)) e)
860 (peg-unary-nullable e path))
861
862(defun peg-unary-nullable (exp path)
863 (peg-detect-cycles exp path)
864 t)
865
866(cl-defmethod peg--detect-cycles (_path (_ (eql any))) nil)
867(cl-defmethod peg--detect-cycles (_path (_ (eql char)) _c) nil)
868(cl-defmethod peg--detect-cycles (_path (_ (eql set)) _r _c _k) nil)
869(cl-defmethod peg--detect-cycles (_path (_ (eql range)) _c1 _c2) nil)
870(cl-defmethod peg--detect-cycles (_path (_ (eql str)) s) (equal s ""))
871(cl-defmethod peg--detect-cycles (_path (_ (eql guard)) _e) t)
872(cl-defmethod peg--detect-cycles (_path (_ (eql =)) _s) nil)
873(cl-defmethod peg--detect-cycles (_path (_ (eql syntax-class)) _n) nil)
874(cl-defmethod peg--detect-cycles (_path (_ (eql action)) _form) t)
875
876(defun peg-merge-errors (exps)
877 "Build a more readable error message out of failed expression."
878 (let ((merged '()))
879 (dolist (exp exps)
880 (setq merged (peg-merge-error exp merged)))
881 merged))
882
883(defun peg-merge-error (exp merged)
884 (apply #'peg--merge-error merged exp))
885
886(cl-defgeneric peg--merge-error (_merged head &rest args)
887 (error "No merge-error method for: %S" (cons head args)))
888
889(cl-defmethod peg--merge-error (merged (_ (eql or)) e1 e2)
890 (peg-merge-error e2 (peg-merge-error e1 merged)))
891
892(cl-defmethod peg--merge-error (merged (_ (eql and)) e1 _e2)
893 ;; FIXME: Why is `e2' not used?
894 (peg-merge-error e1 merged))
895
896(cl-defmethod peg--merge-error (merged (_ (eql str)) str)
897 ;;(add-to-list 'merged str)
898 (cl-adjoin str merged :test #'equal))
899
900(cl-defmethod peg--merge-error (merged (_ (eql call)) rule)
901 ;; (add-to-list 'merged rule)
902 (cl-adjoin rule merged :test #'equal))
903
904(cl-defmethod peg--merge-error (merged (_ (eql char)) char)
905 ;; (add-to-list 'merged (string char))
906 (cl-adjoin (string char) merged :test #'equal))
907
908(cl-defmethod peg--merge-error (merged (_ (eql set)) r c k)
909 ;; (add-to-list 'merged (peg-make-charset-regexp r c k))
910 (cl-adjoin (peg-make-charset-regexp r c k) merged :test #'equal))
911
912(cl-defmethod peg--merge-error (merged (_ (eql range)) from to)
913 ;; (add-to-list 'merged (format "[%c-%c]" from to))
914 (cl-adjoin (format "[%c-%c]" from to) merged :test #'equal))
915
916(cl-defmethod peg--merge-error (merged (_ (eql *)) exp)
917 (peg-merge-error exp merged))
918
919(cl-defmethod peg--merge-error (merged (_ (eql any)))
920 ;; (add-to-list 'merged '(any))
921 (cl-adjoin '(any) merged :test #'equal))
922
923(cl-defmethod peg--merge-error (merged (_ (eql not)) x)
924 ;; (add-to-list 'merged `(not ,x))
925 (cl-adjoin `(not ,x) merged :test #'equal))
926
927(cl-defmethod peg--merge-error (merged (_ (eql action)) _action) merged)
928(cl-defmethod peg--merge-error (merged (_ (eql null))) merged)
929
930(provide 'peg)
931(require 'peg)
932
933(define-peg-rule null () :inline t (guard t))
934(define-peg-rule fail () :inline t (guard nil))
935(define-peg-rule bob () :inline t (guard (bobp)))
936(define-peg-rule eob () :inline t (guard (eobp)))
937(define-peg-rule bol () :inline t (guard (bolp)))
938(define-peg-rule eol () :inline t (guard (eolp)))
939(define-peg-rule bow () :inline t (guard (looking-at "\\<")))
940(define-peg-rule eow () :inline t (guard (looking-at "\\>")))
941(define-peg-rule bos () :inline t (guard (looking-at "\\_<")))
942(define-peg-rule eos () :inline t (guard (looking-at "\\_>")))
943
944;;; peg.el ends here
diff --git a/test/lisp/peg-tests.el b/test/lisp/peg-tests.el
new file mode 100644
index 00000000000..864e09b4200
--- /dev/null
+++ b/test/lisp/peg-tests.el
@@ -0,0 +1,367 @@
1;;; peg-tests.el --- Tests of PEG parsers -*- lexical-binding: t; -*-
2
3;; Copyright (C) 2008-2023 Free Software Foundation, Inc.
4
5;; This program is free software; you can redistribute it and/or modify
6;; it under the terms of the GNU General Public License as published by
7;; the Free Software Foundation, either version 3 of the License, or
8;; (at your option) any later version.
9
10;; This program is distributed in the hope that it will be useful,
11;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13;; GNU General Public License for more details.
14
15;; You should have received a copy of the GNU General Public License
16;; along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18;;; Commentary:
19
20;; Tests and examples, that used to live in peg.el wrapped inside an `eval'.
21
22;;; Code:
23
24(require 'peg)
25(require 'ert)
26
27;;; Tests:
28
29(defmacro peg-parse-string (pex string &optional noerror)
30 "Parse STRING according to PEX.
31If NOERROR is non-nil, push nil resp. t if the parse failed
32resp. succeeded instead of signaling an error."
33 (let ((oldstyle (consp (car-safe pex)))) ;PEX is really a list of rules.
34 `(with-temp-buffer
35 (insert ,string)
36 (goto-char (point-min))
37 ,(if oldstyle
38 `(with-peg-rules ,pex
39 (peg-run (peg ,(caar pex))
40 ,(unless noerror '#'peg-signal-failure)))
41 `(peg-run (peg ,pex)
42 ,(unless noerror '#'peg-signal-failure))))))
43
44(define-peg-rule peg-test-natural ()
45 [0-9] (* [0-9]))
46
47(ert-deftest peg-test ()
48 (should (peg-parse-string peg-test-natural "99 bottles" t))
49 (should (peg-parse-string ((s "a")) "a" t))
50 (should (not (peg-parse-string ((s "a")) "b" t)))
51 (should (peg-parse-string ((s (not "a"))) "b" t))
52 (should (not (peg-parse-string ((s (not "a"))) "a" t)))
53 (should (peg-parse-string ((s (if "a"))) "a" t))
54 (should (not (peg-parse-string ((s (if "a"))) "b" t)))
55 (should (peg-parse-string ((s "ab")) "ab" t))
56 (should (not (peg-parse-string ((s "ab")) "ba" t)))
57 (should (not (peg-parse-string ((s "ab")) "a" t)))
58 (should (peg-parse-string ((s (range ?0 ?9))) "0" t))
59 (should (not (peg-parse-string ((s (range ?0 ?9))) "a" t)))
60 (should (peg-parse-string ((s [0-9])) "0" t))
61 (should (not (peg-parse-string ((s [0-9])) "a" t)))
62 (should (not (peg-parse-string ((s [0-9])) "" t)))
63 (should (peg-parse-string ((s (any))) "0" t))
64 (should (not (peg-parse-string ((s (any))) "" t)))
65 (should (peg-parse-string ((s (eob))) "" t))
66 (should (peg-parse-string ((s (not (eob)))) "a" t))
67 (should (peg-parse-string ((s (or "a" "b"))) "a" t))
68 (should (peg-parse-string ((s (or "a" "b"))) "b" t))
69 (should (not (peg-parse-string ((s (or "a" "b"))) "c" t)))
70 (should (peg-parse-string (and "a" "b") "ab" t))
71 (should (peg-parse-string ((s (and "a" "b"))) "abc" t))
72 (should (not (peg-parse-string (and "a" "b") "ba" t)))
73 (should (peg-parse-string ((s (and "a" "b" "c"))) "abc" t))
74 (should (peg-parse-string ((s (* "a") "b" (eob))) "b" t))
75 (should (peg-parse-string ((s (* "a") "b" (eob))) "ab" t))
76 (should (peg-parse-string ((s (* "a") "b" (eob))) "aaab" t))
77 (should (not (peg-parse-string ((s (* "a") "b" (eob))) "abc" t)))
78 (should (peg-parse-string ((s "")) "abc" t))
79 (should (peg-parse-string ((s "" (eob))) "" t))
80 (should (peg-parse-string ((s (opt "a") "b")) "abc" t))
81 (should (peg-parse-string ((s (opt "a") "b")) "bc" t))
82 (should (not (peg-parse-string ((s (or))) "ab" t)))
83 (should (peg-parse-string ((s (and))) "ab" t))
84 (should (peg-parse-string ((s (and))) "" t))
85 (should (peg-parse-string ((s ["^"])) "^" t))
86 (should (peg-parse-string ((s ["^a"])) "a" t))
87 (should (peg-parse-string ["-"] "-" t))
88 (should (peg-parse-string ((s ["]-"])) "]" t))
89 (should (peg-parse-string ((s ["^]"])) "^" t))
90 (should (peg-parse-string ((s [alpha])) "z" t))
91 (should (not (peg-parse-string ((s [alpha])) "0" t)))
92 (should (not (peg-parse-string ((s [alpha])) "" t)))
93 (should (not (peg-parse-string ((s ["][:alpha:]"])) "z" t)))
94 (should (peg-parse-string ((s (bob))) "" t))
95 (should (peg-parse-string ((s (bos))) "x" t))
96 (should (not (peg-parse-string ((s (bos))) " x" t)))
97 (should (peg-parse-string ((s "x" (eos))) "x" t))
98 (should (peg-parse-string ((s (syntax-class whitespace))) " " t))
99 (should (peg-parse-string ((s (= "foo"))) "foo" t))
100 (should (let ((f "foo")) (peg-parse-string ((s (= f))) "foo" t)))
101 (should (not (peg-parse-string ((s (= "foo"))) "xfoo" t)))
102 (should (equal (peg-parse-string ((s `(-- 1 2))) "") '(2 1)))
103 (should (equal (peg-parse-string ((s `(-- 1 2) `(a b -- a b))) "") '(2 1)))
104 (should (equal (peg-parse-string ((s (or (and (any) s)
105 (substring [0-9]))))
106 "ab0cd1ef2gh")
107 '("2")))
108 ;; The PEG rule `other' doesn't exist, which will cause a byte-compiler
109 ;; warning, but not an error at run time because the rule is not actually
110 ;; used in this particular case.
111 (should (equal (peg-parse-string ((s (substring (or "a" other)))
112 ;; Unused left-recursive rule, should
113 ;; cause a byte-compiler warning.
114 (r (* "a") r))
115 "af")
116 '("a")))
117 (should (equal (peg-parse-string ((s (list x y))
118 (x `(-- 1))
119 (y `(-- 2)))
120 "")
121 '((1 2))))
122 (should (equal (peg-parse-string ((s (list (* x)))
123 (x "" `(-- 'x)))
124 "xxx")
125 ;; The empty loop body should be matched once!
126 '((x))))
127 (should (equal (peg-parse-string ((s (list (* x)))
128 (x "x" `(-- 'x)))
129 "xxx")
130 '((x x x))))
131 (should (equal (peg-parse-string ((s (region (* x)))
132 (x "x" `(-- 'x)))
133 "xxx")
134 ;; FIXME: Since string positions start at 0, this should
135 ;; really be '(3 x x x 0) !!
136 '(4 x x x 1)))
137 (should (equal (peg-parse-string ((s (region (list (* x))))
138 (x "x" `(-- 'x 'y)))
139 "xxx")
140 '(4 (x y x y x y) 1)))
141 (should (equal (with-temp-buffer
142 (save-excursion (insert "abcdef"))
143 (list
144 (peg-run (peg "a"
145 (replace "bc" "x")
146 (replace "de" "y")
147 "f"))
148 (buffer-string)))
149 '(t "axyf")))
150 (with-temp-buffer
151 (insert "toro")
152 (goto-char (point-min))
153 (should (peg-run (peg "to")))
154 (should-not (peg-run (peg "to")))
155 (should (peg-run (peg "ro")))
156 (should (eobp)))
157 (with-temp-buffer
158 (insert " ")
159 (goto-char (point-min))
160 (peg-run (peg (+ (syntax-class whitespace))))
161 (should (eobp)))
162 )
163
164;;; Examples:
165
166;; peg-ex-recognize-int recognizes integers. An integer begins with a
167;; optional sign, then follows one or more digits. Digits are all
168;; characters from 0 to 9.
169;;
170;; Notes:
171;; 1) "" matches the empty sequence, i.e. matches without consuming
172;; input.
173;; 2) [0-9] is the character range from 0 to 9. This can also be
174;; written as (range ?0 ?9). Note that 0-9 is a symbol.
175(defun peg-ex-recognize-int ()
176 (with-peg-rules ((number sign digit (* digit))
177 (sign (or "+" "-" ""))
178 (digit [0-9]))
179 (peg-run (peg number))))
180
181;; peg-ex-parse-int recognizes integers and computes the corresponding
182;; value. The grammar is the same as for `peg-ex-recognize-int'
183;; augmented with parsing actions. Unfortunaletly, the actions add
184;; quite a bit of clutter.
185;;
186;; The actions for the sign rule push -1 on the stack for a minus sign
187;; and 1 for plus or no sign.
188;;
189;; The action for the digit rule pushes the value for a single digit.
190;;
191;; The action `(a b -- (+ (* a 10) b)), takes two items from the stack
192;; and pushes the first digit times 10 added to the second digit.
193;;
194;; The action `(sign val -- (* sign val)), multiplies val with the
195;; sign (1 or -1).
196(defun peg-ex-parse-int ()
197 (with-peg-rules ((number sign digit (* digit
198 `(a b -- (+ (* a 10) b)))
199 `(sign val -- (* sign val)))
200 (sign (or (and "+" `(-- 1))
201 (and "-" `(-- -1))
202 (and "" `(-- 1))))
203 (digit [0-9] `(-- (- (char-before) ?0))))
204 (peg-run (peg number))))
205
206;; Put point after the ) and press C-x C-e
207;; (peg-ex-parse-int)-234234
208
209;; Parse arithmetic expressions and compute the result as side effect.
210(defun peg-ex-arith ()
211 (peg-parse
212 (expr _ sum eol)
213 (sum product (* (or (and "+" _ product `(a b -- (+ a b)))
214 (and "-" _ product `(a b -- (- a b))))))
215 (product value (* (or (and "*" _ value `(a b -- (* a b)))
216 (and "/" _ value `(a b -- (/ a b))))))
217 (value (or (and (substring number) `(string -- (string-to-number string)))
218 (and "(" _ sum ")" _)))
219 (number (+ [0-9]) _)
220 (_ (* [" \t"]))
221 (eol (or "\n" "\r\n" "\r"))))
222
223;; (peg-ex-arith) 1 + 2 * 3 * (4 + 5)
224;; (peg-ex-arith) 1 + 2 ^ 3 * (4 + 5) ; fails to parse
225
226;; Parse URI according to RFC 2396.
227(defun peg-ex-uri ()
228 (peg-parse
229 (URI-reference (or absoluteURI relativeURI)
230 (or (and "#" (substring fragment))
231 `(-- nil))
232 `(scheme user host port path query fragment --
233 (list :scheme scheme :user user
234 :host host :port port
235 :path path :query query
236 :fragment fragment)))
237 (absoluteURI (substring scheme) ":" (or hier-part opaque-part))
238 (hier-part ;(-- user host port path query)
239 (or net-path
240 (and `(-- nil nil nil)
241 abs-path))
242 (or (and "?" (substring query))
243 `(-- nil)))
244 (net-path "//" authority (or abs-path `(-- nil)))
245 (abs-path "/" path-segments)
246 (path-segments segment (list (* "/" segment)) `(s l -- (cons s l)))
247 (segment (substring (* pchar) (* ";" param)))
248 (param (* pchar))
249 (pchar (or unreserved escaped [":@&=+$,"]))
250 (query (* uric))
251 (fragment (* uric))
252 (relativeURI (or net-path abs-path rel-path) (opt "?" query))
253 (rel-path rel-segment (opt abs-path))
254 (rel-segment (+ unreserved escaped [";@&=+$,"]))
255 (authority (or server reg-name))
256 (server (or (and (or (and (substring userinfo) "@")
257 `(-- nil))
258 hostport)
259 `(-- nil nil nil)))
260 (userinfo (* (or unreserved escaped [";:&=+$,"])))
261 (hostport (substring host) (or (and ":" (substring port))
262 `(-- nil)))
263 (host (or hostname ipv4address))
264 (hostname (* domainlabel ".") toplabel (opt "."))
265 (domainlabel alphanum
266 (opt (* (or alphanum "-") (if alphanum))
267 alphanum))
268 (toplabel alpha
269 (* (or alphanum "-") (if alphanum))
270 alphanum)
271 (ipv4address (+ digit) "." (+ digit) "." (+ digit) "." (+ digit))
272 (port (* digit))
273 (scheme alpha (* (or alpha digit ["+-."])))
274 (reg-name (or unreserved escaped ["$,;:@&=+"]))
275 (opaque-part uric-no-slash (* uric))
276 (uric (or reserved unreserved escaped))
277 (uric-no-slash (or unreserved escaped [";?:@&=+$,"]))
278 (reserved (set ";/?:@&=+$,"))
279 (unreserved (or alphanum mark))
280 (escaped "%" hex hex)
281 (hex (or digit [A-F] [a-f]))
282 (mark (set "-_.!~*'()"))
283 (alphanum (or alpha digit))
284 (alpha (or lowalpha upalpha))
285 (lowalpha [a-z])
286 (upalpha [A-Z])
287 (digit [0-9])))
288
289;; (peg-ex-uri)http://luser@www.foo.com:8080/bar/baz.html?x=1#foo
290;; (peg-ex-uri)file:/bar/baz.html?foo=df#x
291
292;; Split STRING where SEPARATOR occurs.
293(defun peg-ex-split (string separator)
294 (peg-parse-string ((s (list (* (* sep) elt)))
295 (elt (substring (+ (not sep) (any))))
296 (sep (= separator)))
297 string))
298
299;; (peg-ex-split "-abc-cd-" "-")
300
301;; Parse a lisp style Sexp.
302;; [To keep the example short, ' and . are handled as ordinary symbol.]
303(defun peg-ex-lisp ()
304 (peg-parse
305 (sexp _ (or string list number symbol))
306 (_ (* (or [" \n\t"] comment)))
307 (comment ";" (* (not (or "\n" (eob))) (any)))
308 (string "\"" (substring (* (not "\"") (any))) "\"")
309 (number (substring (opt (set "+-")) (+ digit))
310 (if terminating)
311 `(string -- (string-to-number string)))
312 (symbol (substring (and symchar (* (not terminating) symchar)))
313 `(s -- (intern s)))
314 (symchar [a-z A-Z 0-9 "-;!#%&'*+,./:;<=>?@[]^_`{|}~"])
315 (list "(" `(-- (cons nil nil)) `(hd -- hd hd)
316 (* sexp `(tl e -- (setcdr tl (list e))))
317 _ ")" `(hd _tl -- (cdr hd)))
318 (digit [0-9])
319 (terminating (or (set " \n\t();\"'") (eob)))))
320
321;; (peg-ex-lisp)
322
323;; We try to detect left recursion and report it as error.
324(defun peg-ex-left-recursion ()
325 (eval '(peg-parse (exp (or term
326 (and exp "+" exp)))
327 (term (or digit
328 (and term "*" term)))
329 (digit [0-9]))
330 t))
331
332(defun peg-ex-infinite-loop ()
333 (eval '(peg-parse (exp (* (or "x"
334 "y"
335 (action (foo))))))
336 t))
337
338;; Some efficiency problems:
339
340;; Find the last digit in a string.
341;; Recursive definition with excessive stack usage.
342(defun peg-ex-last-digit (string)
343 (peg-parse-string ((s (or (and (any) s)
344 (substring [0-9]))))
345 string))
346
347;; (peg-ex-last-digit "ab0cd1ef2gh")
348;; (peg-ex-last-digit (make-string 50 ?-))
349;; (peg-ex-last-digit (make-string 1000 ?-))
350
351;; Find the last digit without recursion. Doesn't run out of stack,
352;; but probably still too inefficient for large inputs.
353(defun peg-ex-last-digit2 (string)
354 (peg-parse-string ((s `(-- nil)
355 (+ (* (not digit) (any))
356 (substring digit)
357 `(_d1 d2 -- d2)))
358 (digit [0-9]))
359 string))
360
361;; (peg-ex-last-digit2 "ab0cd1ef2gh")
362;; (peg-ex-last-digit2 (concat (make-string 500000 ?-) "8a9b"))
363;; (peg-ex-last-digit2 (make-string 500000 ?-))
364;; (peg-ex-last-digit2 (make-string 500000 ?5))
365
366(provide 'peg-tests)
367;;; peg-tests.el ends here