aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChong Yidong2012-07-05 00:14:05 +0800
committerChong Yidong2012-07-05 00:14:05 +0800
commit566df3fcac8010303c1d8b8558cb07f3a057b346 (patch)
treec03584ee0936855ce95cb9e84c241cc016c095f5
parent0781098af7c8da77b1d044dce151e6a130eb1e77 (diff)
downloademacs-566df3fcac8010303c1d8b8558cb07f3a057b346.tar.gz
emacs-566df3fcac8010303c1d8b8558cb07f3a057b346.zip
Clean up syntax-table usage in xml.el
* xml.el (xml--parse-buffer): Use xml-syntax-table. (xml-parse-tag): Likewise, and avoid changing entity tables. (xml-syntax-table): Define from scratch, making sure not to give x2000 and other Unicode spaces whitespace syntax, since those are not spaces in XML. (xml-parse-fragment): Delete unused function. (xml-name-start-char-re, xml-name-char-re, xml-name-re) (xml-names-re, xml-nmtoken-re, xml-nmtokens-re, xml-char-ref-re) (xml-entity-ref, xml-pe-reference-re) (xml-reference-re,xml-att-value-re, xml-tokenized-type-re) (xml-notation-type-re, xml-enumeration-re, xml-enumerated-type-re) (xml-att-type-re, xml-default-decl-re, xml-att-def-re) (xml-entity-value-re): Use syntax references in regexps where possible; no need to define inside a let-binding. (xml-parse-dtd): Use xml-pe-reference-re. (xml-entity-or-char-ref-re): New defconst. (xml-parse-string, xml-substitute-special): Use it.
-rw-r--r--lisp/ChangeLog20
-rw-r--r--lisp/xml.el377
-rw-r--r--test/automated/xml-parse-tests.el16
3 files changed, 222 insertions, 191 deletions
diff --git a/lisp/ChangeLog b/lisp/ChangeLog
index 0a486daa809..8cef65cb10c 100644
--- a/lisp/ChangeLog
+++ b/lisp/ChangeLog
@@ -1,3 +1,23 @@
12012-07-04 Chong Yidong <cyd@gnu.org>
2
3 * xml.el (xml--parse-buffer): Use xml-syntax-table.
4 (xml-parse-tag): Likewise, and avoid changing entity tables.
5 (xml-syntax-table): Define from scratch, making sure not to give
6 x2000 and other Unicode spaces whitespace syntax, since those are
7 not spaces in XML.
8 (xml-parse-fragment): Delete unused function.
9 (xml-name-start-char-re, xml-name-char-re, xml-name-re)
10 (xml-names-re, xml-nmtoken-re, xml-nmtokens-re, xml-char-ref-re)
11 (xml-entity-ref, xml-pe-reference-re)
12 (xml-reference-re,xml-att-value-re, xml-tokenized-type-re)
13 (xml-notation-type-re, xml-enumeration-re, xml-enumerated-type-re)
14 (xml-att-type-re, xml-default-decl-re, xml-att-def-re)
15 (xml-entity-value-re): Use syntax references in regexps where
16 possible; no need to define inside a let-binding.
17 (xml-parse-dtd): Use xml-pe-reference-re.
18 (xml-entity-or-char-ref-re): New defconst.
19 (xml-parse-string, xml-substitute-special): Use it.
20
12012-07-04 Stefan Monnier <monnier@iro.umontreal.ca> 212012-07-04 Stefan Monnier <monnier@iro.umontreal.ca>
2 22
3 * files.el (locate-dominating-file): Allow `name' to be a predicate. 23 * files.el (locate-dominating-file): Allow `name' to be a predicate.
diff --git a/lisp/xml.el b/lisp/xml.el
index f2c1a703f88..e2788e5e756 100644
--- a/lisp/xml.el
+++ b/lisp/xml.el
@@ -164,93 +164,107 @@ An empty string is returned if the attribute was not found.
164See also `xml-get-attribute-or-nil'." 164See also `xml-get-attribute-or-nil'."
165 (or (xml-get-attribute-or-nil node attribute) "")) 165 (or (xml-get-attribute-or-nil node attribute) ""))
166 166
167;;; Creating the list 167;;; Regular expressions for XML components
168
169;;;###autoload
170(defun xml-parse-file (file &optional parse-dtd parse-ns)
171 "Parse the well-formed XML file FILE.
172Return the top node with all its children.
173If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
174If PARSE-NS is non-nil, then QNAMES are expanded."
175 (with-temp-buffer
176 (insert-file-contents file)
177 (xml--parse-buffer parse-dtd parse-ns)))
178 168
169;; The following regexps are used as subexpressions in regexps that
170;; are `eval-when-compile'd for efficiency, so they must be defined at
171;; compile time.
179(eval-and-compile 172(eval-and-compile
180(let* ((start-chars (concat "[:alpha:]:_")) 173
181 (name-chars (concat "-[:digit:]." start-chars)) 174;; [4] NameStartChar
182 ;;[3] S ::= (#x20 | #x9 | #xD | #xA)+ 175;; See the definition of word syntax in `xml-syntax-table'.
183 (whitespace "[ \t\n\r]")) 176(defconst xml-name-start-char-re (concat "[[:word:]:_]"))
184 ;; [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] 177
185 ;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] 178;; [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7
186 ;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] 179;; | [#x0300-#x036F] | [#x203F-#x2040]
187 ;; | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] 180(defconst xml-name-char-re (concat "[-0-9.[:word:]:_·̀-ͯ‿-⁀]"))
188 ;; | [#x10000-#xEFFFF] 181
189 (defconst xml-name-start-char-re (concat "[" start-chars "]")) 182;; [5] Name ::= NameStartChar (NameChar)*
190 ;; [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 183(defconst xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
191 ;; | [#x0300-#x036F] | [#x203F-#x2040] 184
192 (defconst xml-name-char-re (concat "[" name-chars "]")) 185;; [6] Names ::= Name (#x20 Name)*
193 ;; [5] Name ::= NameStartChar (NameChar)* 186(defconst xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
194 (defconst xml-name-re (concat xml-name-start-char-re xml-name-char-re "*")) 187
195 ;; [6] Names ::= Name (#x20 Name)* 188;; [7] Nmtoken ::= (NameChar)+
196 (defconst xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*")) 189(defconst xml-nmtoken-re (concat xml-name-char-re "+"))
197 ;; [7] Nmtoken ::= (NameChar)+ 190
198 (defconst xml-nmtoken-re (concat xml-name-char-re "+")) 191;; [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
199 ;; [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* 192(defconst xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
200 (defconst xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*")) 193
201 ;; [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' 194;; [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
202 (defconst xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)") 195(defconst xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
203 ;; [68] EntityRef ::= '&' Name ';' 196
204 (defconst xml-entity-ref (concat "&" xml-name-re ";")) 197;; [68] EntityRef ::= '&' Name ';'
205 ;; [69] PEReference ::= '%' Name ';' 198(defconst xml-entity-ref (concat "&" xml-name-re ";"))
206 (defconst xml-pe-reference-re (concat "%" xml-name-re ";")) 199
207 ;; [67] Reference ::= EntityRef | CharRef 200(defconst xml-entity-or-char-ref-re (concat "&\\(?:#\\(x\\)?\\([0-9]+\\)\\|\\("
208 (defconst xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)")) 201 xml-name-re "\\)\\);"))
209 ;; [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" 202
210 (defconst xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" xml-reference-re "\\)*\"\\|" 203;; [69] PEReference ::= '%' Name ';'
211 "'\\(?:[^&']\\|" xml-reference-re "\\)*'\\)")) 204(defconst xml-pe-reference-re (concat "%\\(" xml-name-re "\\);"))
212 ;; [56] TokenizedType ::= 'ID' [VC: ID] [VC: One ID / Element Type] [VC: ID Attribute Default] 205
213 ;; | 'IDREF' [VC: IDREF] 206;; [67] Reference ::= EntityRef | CharRef
214 ;; | 'IDREFS' [VC: IDREF] 207(defconst xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
215 ;; | 'ENTITY' [VC: Entity Name] 208
216 ;; | 'ENTITIES' [VC: Entity Name] 209;; [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
217 ;; | 'NMTOKEN' [VC: Name Token] 210;; | "'" ([^<&'] | Reference)* "'"
218 ;; | 'NMTOKENS' [VC: Name Token] 211(defconst xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|"
219 (defconst xml-tokenized-type-re (concat "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|" 212 xml-reference-re "\\)*\"\\|"
220 "ENTITIES\\|NMTOKEN\\|NMTOKENS\\)")) 213 "'\\(?:[^&']\\|" xml-reference-re
221 ;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' 214 "\\)*'\\)"))
222 (defconst xml-notation-type-re 215
223 (concat "\\(?:NOTATION" whitespace "(" whitespace "*" xml-name-re 216;; [56] TokenizedType ::= 'ID'
224 "\\(?:" whitespace "*|" whitespace "*" xml-name-re "\\)*" 217;; [VC: ID] [VC: One ID / Element Type] [VC: ID Attribute Default]
225 whitespace "*)\\)")) 218;; | 'IDREF' [VC: IDREF]
226 ;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' 219;; | 'IDREFS' [VC: IDREF]
227 ;; [VC: Enumeration] [VC: No Duplicate Tokens] 220;; | 'ENTITY' [VC: Entity Name]
228 (defconst xml-enumeration-re (concat "\\(?:(" whitespace "*" xml-nmtoken-re 221;; | 'ENTITIES' [VC: Entity Name]
229 "\\(?:" whitespace "*|" whitespace "*" 222;; | 'NMTOKEN' [VC: Name Token]
230 xml-nmtoken-re "\\)*" 223;; | 'NMTOKENS' [VC: Name Token]
231 whitespace ")\\)")) 224(defconst xml-tokenized-type-re (concat "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|"
232 ;; [57] EnumeratedType ::= NotationType | Enumeration 225 "ENTITIES\\|NMTOKEN\\|NMTOKENS\\)"))
233 (defconst xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re 226
234 "\\|" xml-enumeration-re "\\)")) 227;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
235 ;; [54] AttType ::= StringType | TokenizedType | EnumeratedType 228(defconst xml-notation-type-re
236 ;; [55] StringType ::= 'CDATA' 229 (concat "\\(?:NOTATION\\s-+(\\s-*" xml-name-re
237 (defconst xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re 230 "\\(?:\\s-*|\\s-*" xml-name-re "\\)*\\s-*)\\)"))
238 "\\|" xml-notation-type-re 231
239 "\\|" xml-enumerated-type-re "\\)")) 232;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
240 ;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) 233;; [VC: Enumeration] [VC: No Duplicate Tokens]
241 (defconst xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|\\(?:#FIXED" 234(defconst xml-enumeration-re (concat "\\(?:(\\s-*" xml-nmtoken-re
242 whitespace "\\)*" xml-att-value-re "\\)")) 235 "\\(?:\\s-*|\\s-*" xml-nmtoken-re
243 ;; [53] AttDef ::= S Name S AttType S DefaultDecl 236 "\\)*\\s-+)\\)"))
244 (defconst xml-att-def-re (concat "\\(?:" whitespace "*" xml-name-re 237
245 whitespace "*" xml-att-type-re 238;; [57] EnumeratedType ::= NotationType | Enumeration
246 whitespace "*" xml-default-decl-re "\\)")) 239(defconst xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re
247 ;; [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' 240 "\\|" xml-enumeration-re "\\)"))
248 ;; | "'" ([^%&'] | PEReference | Reference)* "'" 241
249 (defconst xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re 242;; [54] AttType ::= StringType | TokenizedType | EnumeratedType
250 "\\|" xml-reference-re 243;; [55] StringType ::= 'CDATA'
251 "\\)*\"\\|'\\(?:[^%&']\\|" 244(defconst xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re
252 xml-pe-reference-re "\\|" 245 "\\|" xml-notation-type-re
253 xml-reference-re "\\)*'\\)")))) 246 "\\|" xml-enumerated-type-re "\\)"))
247
248;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
249(defconst xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|"
250 "\\(?:#FIXED\\s-+\\)*"
251 xml-att-value-re "\\)"))
252
253;; [53] AttDef ::= S Name S AttType S DefaultDecl
254(defconst xml-att-def-re (concat "\\(?:\\s-*" xml-name-re
255 "\\s-*" xml-att-type-re
256 "\\s-*" xml-default-decl-re "\\)"))
257
258;; [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
259;; | "'" ([^%&'] | PEReference | Reference)* "'"
260(defconst xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|"
261 xml-pe-reference-re
262 "\\|" xml-reference-re
263 "\\)*\"\\|'\\(?:[^%&']\\|"
264 xml-pe-reference-re "\\|"
265 xml-reference-re "\\)*'\\)"))
266) ; End of `eval-when-compile'
267
254 268
255;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral 269;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
256;; | 'PUBLIC' S PubidLiteral S SystemLiteral 270;; | 'PUBLIC' S PubidLiteral S SystemLiteral
@@ -263,53 +277,59 @@ If PARSE-NS is non-nil, then QNAMES are expanded."
263 277
264;; Note that this is setup so that we can do whitespace-skipping with 278;; Note that this is setup so that we can do whitespace-skipping with
265;; `(skip-syntax-forward " ")', inter alia. Previously this was slow 279;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
266;; compared with `re-search-forward', but that has been fixed. Also 280;; compared with `re-search-forward', but that has been fixed.
267;; note that the standard syntax table contains other characters with
268;; whitespace syntax, like NBSP, but they are invalid in contexts in
269;; which we might skip whitespace -- specifically, they're not
270;; NameChars [XML 4].
271 281
272(defvar xml-syntax-table 282(defvar xml-syntax-table
273 (let ((table (make-syntax-table))) 283 ;; By default, characters have symbol syntax.
274 ;; Get space syntax correct per XML [3]. 284 (let ((table (make-char-table 'syntax-table '(3))))
275 (dotimes (c 31) 285 ;; The XML space chars [3], and nothing else, have space syntax.
276 (modify-syntax-entry c "." table)) ; all are space in standard table 286 (dolist (c '(?\s ?\t ?\r ?\n))
277 (dolist (c '(?\t ?\n ?\r)) ; these should be space
278 (modify-syntax-entry c " " table)) 287 (modify-syntax-entry c " " table))
279 ;; For skipping attributes. 288 ;; The characters in NameStartChar [4], aside from ':' and '_',
280 (modify-syntax-entry ?\" "\"" table) 289 ;; have word syntax. This is used by `xml-name-start-char-re'.
281 (modify-syntax-entry ?' "\"" table) 290 (modify-syntax-entry '(?A . ?Z) "w" table)
282 ;; Non-alnum name chars should be symbol constituents (`-' and `_' 291 (modify-syntax-entry '(?a . ?z) "w" table)
283 ;; are OK by default). 292 (modify-syntax-entry '(#xC0 . #xD6) "w" table)
284 (modify-syntax-entry ?. "_" table) 293 (modify-syntax-entry '(#xD8 . #XF6) "w" table)
285 (modify-syntax-entry ?: "_" table) 294 (modify-syntax-entry '(#xF8 . #X2FF) "w" table)
286 ;; XML [89] 295 (modify-syntax-entry '(#x370 . #X37D) "w" table)
287 (unless (featurep 'xemacs) 296 (modify-syntax-entry '(#x37F . #x1FFF) "w" table)
288 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005 297 (modify-syntax-entry '(#x200C . #x200D) "w" table)
289 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC 298 (modify-syntax-entry '(#x2070 . #x218F) "w" table)
290 #x30FD #x30FE)) 299 (modify-syntax-entry '(#x2C00 . #x2FEF) "w" table)
291 (modify-syntax-entry (decode-char 'ucs c) "w" table))) 300 (modify-syntax-entry '(#x3001 . #xD7FF) "w" table)
292 ;; Fixme: rest of [4] 301 (modify-syntax-entry '(#xF900 . #xFDCF) "w" table)
302 (modify-syntax-entry '(#xFDF0 . #xFFFD) "w" table)
303 (modify-syntax-entry '(#x10000 . #xEFFFF) "w" table)
293 table) 304 table)
294 "Syntax table used by `xml-parse-region'.") 305 "Syntax table used by the XML parser.
306In this syntax table, the XML space characters [ \\t\\r\\n], and
307only those characters, have whitespace syntax.")
295 308
296;; XML [5] 309;;; Entry points:
297 310
298;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e. 311;;;###autoload
299;; document ::= prolog element Misc* 312(defun xml-parse-file (file &optional parse-dtd parse-ns)
300;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? 313 "Parse the well-formed XML file FILE.
314Return the top node with all its children.
315If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
316If PARSE-NS is non-nil, then QNAMES are expanded."
317 (with-temp-buffer
318 (insert-file-contents file)
319 (xml--parse-buffer parse-dtd parse-ns)))
301 320
302;;;###autoload 321;;;###autoload
303(defun xml-parse-region (&optional beg end buffer parse-dtd parse-ns) 322(defun xml-parse-region (&optional beg end buffer parse-dtd parse-ns)
304 "Parse the region from BEG to END in BUFFER. 323 "Parse the region from BEG to END in BUFFER.
324Return the XML parse tree, or raise an error if the region does
325not contain well-formed XML.
326
305If BEG is nil, it defaults to `point-min'. 327If BEG is nil, it defaults to `point-min'.
306If END is nil, it defaults to `point-max'. 328If END is nil, it defaults to `point-max'.
307If BUFFER is nil, it defaults to the current buffer. 329If BUFFER is nil, it defaults to the current buffer.
308Returns the XML list for the region, or raises an error if the region 330If PARSE-DTD is non-nil, parse the DTD and return it as the first
309is not well-formed XML. 331element of the list.
310If PARSE-DTD is non-nil, the DTD is parsed rather than skipped, 332If PARSE-NS is non-nil, expand QNAMES."
311and returned as the first element of the list.
312If PARSE-NS is non-nil, then QNAMES are expanded."
313 ;; Use fixed syntax table to ensure regexp char classes and syntax 333 ;; Use fixed syntax table to ensure regexp char classes and syntax
314 ;; specs DTRT. 334 ;; specs DTRT.
315 (unless buffer 335 (unless buffer
@@ -318,8 +338,14 @@ If PARSE-NS is non-nil, then QNAMES are expanded."
318 (insert-buffer-substring-no-properties buffer beg end) 338 (insert-buffer-substring-no-properties buffer beg end)
319 (xml--parse-buffer parse-dtd parse-ns))) 339 (xml--parse-buffer parse-dtd parse-ns)))
320 340
341;; XML [5]
342
343;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
344;; document ::= prolog element Misc*
345;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
346
321(defun xml--parse-buffer (parse-dtd parse-ns) 347(defun xml--parse-buffer (parse-dtd parse-ns)
322 (with-syntax-table (standard-syntax-table) 348 (with-syntax-table xml-syntax-table
323 (let ((case-fold-search nil) ; XML is case-sensitive. 349 (let ((case-fold-search nil) ; XML is case-sensitive.
324 ;; Prevent entity definitions from changing the defaults 350 ;; Prevent entity definitions from changing the defaults
325 (xml-entity-alist xml-entity-alist) 351 (xml-entity-alist xml-entity-alist)
@@ -374,22 +400,6 @@ specify that the name shouldn't be given a namespace."
374 (cons ns (if special "" lname))) 400 (cons ns (if special "" lname)))
375 (intern name))) 401 (intern name)))
376 402
377(defun xml-parse-fragment (&optional parse-dtd parse-ns)
378 "Parse xml-like fragments."
379 (let ((xml-sub-parser t)
380 ;; Prevent entity definitions from changing the defaults
381 (xml-entity-alist xml-entity-alist)
382 (xml-parameter-entity-alist xml-parameter-entity-alist)
383 children)
384 (while (not (eobp))
385 (let ((bit (xml-parse-tag-1 parse-dtd parse-ns)))
386 (if children
387 (setq children (append (list bit) children))
388 (if (stringp bit)
389 (setq children (list bit))
390 (setq children bit)))))
391 (reverse children)))
392
393(defun xml-parse-tag (&optional parse-dtd parse-ns) 403(defun xml-parse-tag (&optional parse-dtd parse-ns)
394 "Parse the tag at point. 404 "Parse the tag at point.
395If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and 405If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
@@ -401,12 +411,17 @@ Return one of:
401 - a list : the matching node 411 - a list : the matching node
402 - nil : the point is not looking at a tag. 412 - nil : the point is not looking at a tag.
403 - a pair : the first element is the DTD, the second is the node." 413 - a pair : the first element is the DTD, the second is the node."
404 (let ((buf (current-buffer)) 414 (let* ((case-fold-search nil)
405 (pos (point))) 415 ;; Prevent entity definitions from changing the defaults
416 (xml-entity-alist xml-entity-alist)
417 (xml-parameter-entity-alist xml-parameter-entity-alist)
418 (buf (current-buffer))
419 (pos (point)))
406 (with-temp-buffer 420 (with-temp-buffer
407 (insert-buffer-substring-no-properties buf pos) 421 (with-syntax-table xml-syntax-table
408 (goto-char (point-min)) 422 (insert-buffer-substring-no-properties buf pos)
409 (xml-parse-tag-1 parse-dtd parse-ns)))) 423 (goto-char (point-min))
424 (xml-parse-tag-1 parse-dtd parse-ns)))))
410 425
411(defun xml-parse-tag-1 (&optional parse-dtd parse-ns) 426(defun xml-parse-tag-1 (&optional parse-dtd parse-ns)
412 "Like `xml-parse-tag', but possibly modify the buffer while working." 427 "Like `xml-parse-tag', but possibly modify the buffer while working."
@@ -530,40 +545,32 @@ references."
530 (skip-chars-forward "^<&") 545 (skip-chars-forward "^<&")
531 (when (eq (char-after) ?&) 546 (when (eq (char-after) ?&)
532 ;; If we find an entity or character reference, expand it. 547 ;; If we find an entity or character reference, expand it.
533 (unless (looking-at (eval-when-compile 548 (unless (looking-at xml-entity-or-char-ref-re)
534 (concat "&\\(?:#\\([0-9]+\\)\\|#x\\([0-9a-fA-F]+\\)\\|\\("
535 xml-name-re "\\)\\);")))
536 (error "XML: (Not Well-Formed) Invalid entity reference")) 549 (error "XML: (Not Well-Formed) Invalid entity reference"))
537 ;; For a character reference, the next entity or character 550 ;; For a character reference, the next entity or character
538 ;; reference must be after the replacement. [4.6] "Numerical 551 ;; reference must be after the replacement. [4.6] "Numerical
539 ;; character references are expanded immediately when 552 ;; character references are expanded immediately when
540 ;; recognized and MUST be treated as character data." 553 ;; recognized and MUST be treated as character data."
541 (cond ((setq ref (match-string 1)) 554 (if (setq ref (match-string 2))
542 ;; Decimal character reference 555 (progn ; Numeric char reference
543 (setq val (save-match-data 556 (setq val (save-match-data
544 (decode-char 'ucs (string-to-number ref)))) 557 (decode-char 'ucs (string-to-number
545 (and (null val) 558 ref (if (match-string 1) 16)))))
546 xml-validating-parser 559 (and (null val)
547 (error "XML: (Validity) Invalid character `%s'" ref)) 560 xml-validating-parser
548 (replace-match (or (string val) xml-undefined-entity) t t)) 561 (error "XML: (Validity) Invalid character reference `%s'"
549 ;; Hexadecimal character reference 562 (match-string 0)))
550 ((setq ref (match-string 2)) 563 (replace-match (or (string val) xml-undefined-entity) t t))
551 (setq val (save-match-data 564 ;; For an entity reference, search again from the start of
552 (decode-char 'ucs (string-to-number ref 16)))) 565 ;; the replaced text, since the replacement can contain
553 (and (null val) 566 ;; entity or character references, or markup.
554 xml-validating-parser 567 (setq ref (match-string 3)
555 (error "XML: (Validity) Invalid character `x%s'" ref)) 568 val (assoc ref xml-entity-alist))
556 (replace-match (or (string val) xml-undefined-entity) t t)) 569 (and (null val)
557 ;; For an entity reference, search again from the start 570 xml-validating-parser
558 ;; of the replaced text, since the replacement can 571 (error "XML: (Validity) Undefined entity `%s'" ref))
559 ;; contain entity or character references, or markup. 572 (replace-match (cdr val) t t)
560 ((setq ref (match-string 3)) 573 (goto-char (match-beginning 0)))
561 (setq val (assoc ref xml-entity-alist))
562 (and (null val)
563 xml-validating-parser
564 (error "XML: (Validity) Undefined entity `%s'" ref))
565 (replace-match (cdr val) t t)
566 (goto-char (match-beginning 0))))
567 ;; Check for XML bombs. 574 ;; Check for XML bombs.
568 (and xml-entity-expansion-limit 575 (and xml-entity-expansion-limit
569 (> (- (buffer-size) (point)) 576 (> (- (buffer-size) (point))
@@ -610,8 +617,9 @@ Leave point at the first non-blank character after the tag."
610 (replace-regexp-in-string "\\s-\\{2,\\}" " " string) 617 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
611 (let ((expansion (xml-substitute-special string))) 618 (let ((expansion (xml-substitute-special string)))
612 (unless (stringp expansion) 619 (unless (stringp expansion)
613 ; We say this is the constraint. It is actually that neither 620 ;; We say this is the constraint. It is actually that
614 ; external entities nor "<" can be in an attribute value. 621 ;; neither external entities nor "<" can be in an
622 ;; attribute value.
615 (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements")) 623 (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
616 (push (cons name expansion) attlist))) 624 (push (cons name expansion) attlist)))
617 625
@@ -643,8 +651,6 @@ This follows the rule [28] in the XML specifications."
643 (looking-at xml-name-re) 651 (looking-at xml-name-re)
644 (let ((dtd (list (match-string-no-properties 0) 'dtd)) 652 (let ((dtd (list (match-string-no-properties 0) 'dtd))
645 (xml-parameter-entity-alist xml-parameter-entity-alist) 653 (xml-parameter-entity-alist xml-parameter-entity-alist)
646 (parameter-entity-re (eval-when-compile
647 (concat "%\\(" xml-name-re "\\);")))
648 next-parameter-entity) 654 next-parameter-entity)
649 (goto-char (match-end 0)) 655 (goto-char (match-end 0))
650 (skip-syntax-forward " ") 656 (skip-syntax-forward " ")
@@ -693,7 +699,7 @@ This follows the rule [28] in the XML specifications."
693 ;; and try again. 699 ;; and try again.
694 (setq next-parameter-entity 700 (setq next-parameter-entity
695 (save-excursion 701 (save-excursion
696 (if (re-search-forward parameter-entity-re nil t) 702 (if (re-search-forward xml-pe-reference-re nil t)
697 (match-beginning 0)))) 703 (match-beginning 0))))
698 704
699 ;; Parse the rest of the DTD 705 ;; Parse the rest of the DTD
@@ -752,7 +758,7 @@ This follows the rule [28] in the XML specifications."
752 (> (point) next-parameter-entity) 758 (> (point) next-parameter-entity)
753 (setq next-parameter-entity 759 (setq next-parameter-entity
754 (save-excursion 760 (save-excursion
755 (if (re-search-forward parameter-entity-re nil t) 761 (if (re-search-forward xml-pe-reference-re nil t)
756 (match-beginning 0)))))) 762 (match-beginning 0))))))
757 763
758 ;; Internal entity declarations: 764 ;; Internal entity declarations:
@@ -796,7 +802,7 @@ This follows the rule [28] in the XML specifications."
796 (next-parameter-entity 802 (next-parameter-entity
797 (save-excursion 803 (save-excursion
798 (goto-char next-parameter-entity) 804 (goto-char next-parameter-entity)
799 (unless (looking-at parameter-entity-re) 805 (unless (looking-at xml-pe-reference-re)
800 (error "XML: Internal error")) 806 (error "XML: Internal error"))
801 (let* ((entity (match-string 1)) 807 (let* ((entity (match-string 1))
802 (beg (point-marker)) 808 (beg (point-marker))
@@ -808,7 +814,7 @@ This follows the rule [28] in the XML specifications."
808 (goto-char next-parameter-entity)) 814 (goto-char next-parameter-entity))
809 (goto-char (match-end 0)))) 815 (goto-char (match-end 0))))
810 (setq next-parameter-entity 816 (setq next-parameter-entity
811 (if (re-search-forward parameter-entity-re nil t) 817 (if (re-search-forward xml-pe-reference-re nil t)
812 (match-beginning 0))))) 818 (match-beginning 0)))))
813 819
814 ;; Anything else is garbage (ignored if not validating). 820 ;; Anything else is garbage (ignored if not validating).
@@ -889,20 +895,17 @@ references and parameter-entity references."
889(defun xml-substitute-special (string) 895(defun xml-substitute-special (string)
890 "Return STRING, after substituting entity and character references. 896 "Return STRING, after substituting entity and character references.
891STRING is assumed to occur in an XML attribute value." 897STRING is assumed to occur in an XML attribute value."
892 (let ((ref-re (eval-when-compile 898 (let ((strlen (length string))
893 (concat "&\\(?:#\\(x\\)?\\([0-9]+\\)\\|\\("
894 xml-name-re "\\)\\);")))
895 (strlen (length string))
896 children) 899 children)
897 (while (string-match ref-re string) 900 (while (string-match xml-entity-or-char-ref-re string)
898 (push (substring string 0 (match-beginning 0)) children) 901 (push (substring string 0 (match-beginning 0)) children)
899 (let* ((remainder (substring string (match-end 0))) 902 (let* ((remainder (substring string (match-end 0)))
900 (ref (match-string 2 string))) 903 (is-hex (match-string 1 string)) ; Is it a hex numeric reference?
904 (ref (match-string 2 string))) ; Numeric part of reference
901 (if ref 905 (if ref
902 ;; [4.6] Character references are included as 906 ;; [4.6] Character references are included as
903 ;; character data. 907 ;; character data.
904 (let ((val (decode-char 'ucs (string-to-number 908 (let ((val (decode-char 'ucs (string-to-number ref (if is-hex 16)))))
905 ref (if (match-string 1 string) 16)))))
906 (push (cond (val (string val)) 909 (push (cond (val (string val))
907 (xml-validating-parser 910 (xml-validating-parser
908 (error "XML: (Validity) Undefined character `x%s'" ref)) 911 (error "XML: (Validity) Undefined character `x%s'" ref))
@@ -913,7 +916,7 @@ STRING is assumed to occur in an XML attribute value."
913 ;; [4.4.5] Entity references are "included in literal". 916 ;; [4.4.5] Entity references are "included in literal".
914 ;; Note that we don't need do anything special to treat 917 ;; Note that we don't need do anything special to treat
915 ;; quotes as normal data characters. 918 ;; quotes as normal data characters.
916 (setq ref (match-string 3 string)) 919 (setq ref (match-string 3 string)) ; entity name
917 (let ((val (or (cdr (assoc ref xml-entity-alist)) 920 (let ((val (or (cdr (assoc ref xml-entity-alist))
918 (if xml-validating-parser 921 (if xml-validating-parser
919 (error "XML: (Validity) Undefined entity `%s'" ref) 922 (error "XML: (Validity) Undefined entity `%s'" ref)
diff --git a/test/automated/xml-parse-tests.el b/test/automated/xml-parse-tests.el
index ada9bbd4074..e6553060345 100644
--- a/test/automated/xml-parse-tests.el
+++ b/test/automated/xml-parse-tests.el
@@ -30,10 +30,10 @@
30(require 'xml) 30(require 'xml)
31 31
32(defvar xml-parse-tests--data 32(defvar xml-parse-tests--data
33 '(;; General entity substitution 33 `(;; General entity substitution
34 ("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY ent \"AbC\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" . 34 ("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY ent \"AbC\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" .
35 ((foo ((a . "b")) (bar nil "AbC;")))) 35 ((foo ((a . "b")) (bar nil "AbC;"))))
36 ("<?xml version=\"1.0\"?><foo>&amp;amp;&#38;apos;&apos;&lt;&gt;&quot;</foo>" . 36 ("<?xml version=\"1.0\"?><foo>&amp;amp;&#x26;apos;&apos;&lt;&gt;&quot;</foo>" .
37 ((foo () "&amp;&apos;'<>\""))) 37 ((foo () "&amp;&apos;'<>\"")))
38 ;; Parameter entity substitution 38 ;; Parameter entity substitution
39 ("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY % pent \"AbC\"><!ENTITY ent \"%pent;\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" . 39 ("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY % pent \"AbC\"><!ENTITY ent \"%pent;\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" .
@@ -52,7 +52,11 @@
52 ((foo ((a . "-aBc-")) "1"))) 52 ((foo ((a . "-aBc-")) "1")))
53 ;; Character references must be treated as character data 53 ;; Character references must be treated as character data
54 ("<foo>AT&amp;T;</foo>" . ((foo () "AT&T;"))) 54 ("<foo>AT&amp;T;</foo>" . ((foo () "AT&T;")))
55 ("<foo>&#38;amp;</foo>" . ((foo () "&amp;")))) 55 ("<foo>&#38;amp;</foo>" . ((foo () "&amp;")))
56 ("<foo>&#x26;amp;</foo>" . ((foo () "&amp;")))
57 ;; Unusual but valid XML names [5]
58 ("<ÀÖØö.3·-‿⁀󯿿>abc</ÀÖØö.3·-‿⁀󯿿>" . ((,(intern "ÀÖØö.3·-‿⁀󯿿") () "abc")))
59 ("<:>abc</:>" . ((,(intern ":") () "abc"))))
56 "Alist of XML strings and their expected parse trees.") 60 "Alist of XML strings and their expected parse trees.")
57 61
58(defvar xml-parse-tests--bad-data 62(defvar xml-parse-tests--bad-data
@@ -63,7 +67,11 @@
63 ;; Non-terminating DTD 67 ;; Non-terminating DTD
64 "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">" 68 "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">"
65 "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">asdf" 69 "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">asdf"
66 "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">asdf&abc;") 70 "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">asdf&abc;"
71 ;; Invalid XML names
72 "<0foo>abc</0foo>"
73 "<‿foo>abc</‿foo>"
74 "<f¿>abc</f¿>")
67 "List of XML strings that should signal an error in the parser") 75 "List of XML strings that should signal an error in the parser")
68 76
69(ert-deftest xml-parse-tests () 77(ert-deftest xml-parse-tests ()