diff options
| author | Chong Yidong | 2012-07-05 00:14:05 +0800 |
|---|---|---|
| committer | Chong Yidong | 2012-07-05 00:14:05 +0800 |
| commit | 566df3fcac8010303c1d8b8558cb07f3a057b346 (patch) | |
| tree | c03584ee0936855ce95cb9e84c241cc016c095f5 | |
| parent | 0781098af7c8da77b1d044dce151e6a130eb1e77 (diff) | |
| download | emacs-566df3fcac8010303c1d8b8558cb07f3a057b346.tar.gz emacs-566df3fcac8010303c1d8b8558cb07f3a057b346.zip | |
Clean up syntax-table usage in xml.el
* xml.el (xml--parse-buffer): Use xml-syntax-table.
(xml-parse-tag): Likewise, and avoid changing entity tables.
(xml-syntax-table): Define from scratch, making sure not to give
x2000 and other Unicode spaces whitespace syntax, since those are
not spaces in XML.
(xml-parse-fragment): Delete unused function.
(xml-name-start-char-re, xml-name-char-re, xml-name-re)
(xml-names-re, xml-nmtoken-re, xml-nmtokens-re, xml-char-ref-re)
(xml-entity-ref, xml-pe-reference-re)
(xml-reference-re,xml-att-value-re, xml-tokenized-type-re)
(xml-notation-type-re, xml-enumeration-re, xml-enumerated-type-re)
(xml-att-type-re, xml-default-decl-re, xml-att-def-re)
(xml-entity-value-re): Use syntax references in regexps where
possible; no need to define inside a let-binding.
(xml-parse-dtd): Use xml-pe-reference-re.
(xml-entity-or-char-ref-re): New defconst.
(xml-parse-string, xml-substitute-special): Use it.
| -rw-r--r-- | lisp/ChangeLog | 20 | ||||
| -rw-r--r-- | lisp/xml.el | 377 | ||||
| -rw-r--r-- | test/automated/xml-parse-tests.el | 16 |
3 files changed, 222 insertions, 191 deletions
diff --git a/lisp/ChangeLog b/lisp/ChangeLog index 0a486daa809..8cef65cb10c 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog | |||
| @@ -1,3 +1,23 @@ | |||
| 1 | 2012-07-04 Chong Yidong <cyd@gnu.org> | ||
| 2 | |||
| 3 | * xml.el (xml--parse-buffer): Use xml-syntax-table. | ||
| 4 | (xml-parse-tag): Likewise, and avoid changing entity tables. | ||
| 5 | (xml-syntax-table): Define from scratch, making sure not to give | ||
| 6 | x2000 and other Unicode spaces whitespace syntax, since those are | ||
| 7 | not spaces in XML. | ||
| 8 | (xml-parse-fragment): Delete unused function. | ||
| 9 | (xml-name-start-char-re, xml-name-char-re, xml-name-re) | ||
| 10 | (xml-names-re, xml-nmtoken-re, xml-nmtokens-re, xml-char-ref-re) | ||
| 11 | (xml-entity-ref, xml-pe-reference-re) | ||
| 12 | (xml-reference-re,xml-att-value-re, xml-tokenized-type-re) | ||
| 13 | (xml-notation-type-re, xml-enumeration-re, xml-enumerated-type-re) | ||
| 14 | (xml-att-type-re, xml-default-decl-re, xml-att-def-re) | ||
| 15 | (xml-entity-value-re): Use syntax references in regexps where | ||
| 16 | possible; no need to define inside a let-binding. | ||
| 17 | (xml-parse-dtd): Use xml-pe-reference-re. | ||
| 18 | (xml-entity-or-char-ref-re): New defconst. | ||
| 19 | (xml-parse-string, xml-substitute-special): Use it. | ||
| 20 | |||
| 1 | 2012-07-04 Stefan Monnier <monnier@iro.umontreal.ca> | 21 | 2012-07-04 Stefan Monnier <monnier@iro.umontreal.ca> |
| 2 | 22 | ||
| 3 | * files.el (locate-dominating-file): Allow `name' to be a predicate. | 23 | * files.el (locate-dominating-file): Allow `name' to be a predicate. |
diff --git a/lisp/xml.el b/lisp/xml.el index f2c1a703f88..e2788e5e756 100644 --- a/lisp/xml.el +++ b/lisp/xml.el | |||
| @@ -164,93 +164,107 @@ An empty string is returned if the attribute was not found. | |||
| 164 | See also `xml-get-attribute-or-nil'." | 164 | See also `xml-get-attribute-or-nil'." |
| 165 | (or (xml-get-attribute-or-nil node attribute) "")) | 165 | (or (xml-get-attribute-or-nil node attribute) "")) |
| 166 | 166 | ||
| 167 | ;;; Creating the list | 167 | ;;; Regular expressions for XML components |
| 168 | |||
| 169 | ;;;###autoload | ||
| 170 | (defun xml-parse-file (file &optional parse-dtd parse-ns) | ||
| 171 | "Parse the well-formed XML file FILE. | ||
| 172 | Return the top node with all its children. | ||
| 173 | If PARSE-DTD is non-nil, the DTD is parsed rather than skipped. | ||
| 174 | If PARSE-NS is non-nil, then QNAMES are expanded." | ||
| 175 | (with-temp-buffer | ||
| 176 | (insert-file-contents file) | ||
| 177 | (xml--parse-buffer parse-dtd parse-ns))) | ||
| 178 | 168 | ||
| 169 | ;; The following regexps are used as subexpressions in regexps that | ||
| 170 | ;; are `eval-when-compile'd for efficiency, so they must be defined at | ||
| 171 | ;; compile time. | ||
| 179 | (eval-and-compile | 172 | (eval-and-compile |
| 180 | (let* ((start-chars (concat "[:alpha:]:_")) | 173 | |
| 181 | (name-chars (concat "-[:digit:]." start-chars)) | 174 | ;; [4] NameStartChar |
| 182 | ;;[3] S ::= (#x20 | #x9 | #xD | #xA)+ | 175 | ;; See the definition of word syntax in `xml-syntax-table'. |
| 183 | (whitespace "[ \t\n\r]")) | 176 | (defconst xml-name-start-char-re (concat "[[:word:]:_]")) |
| 184 | ;; [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | 177 | |
| 185 | ;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | 178 | ;; [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
| 186 | ;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | 179 | ;; | [#x0300-#x036F] | [#x203F-#x2040] |
| 187 | ;; | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | 180 | (defconst xml-name-char-re (concat "[-0-9.[:word:]:_·̀-ͯ‿-⁀]")) |
| 188 | ;; | [#x10000-#xEFFFF] | 181 | |
| 189 | (defconst xml-name-start-char-re (concat "[" start-chars "]")) | 182 | ;; [5] Name ::= NameStartChar (NameChar)* |
| 190 | ;; [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | 183 | (defconst xml-name-re (concat xml-name-start-char-re xml-name-char-re "*")) |
| 191 | ;; | [#x0300-#x036F] | [#x203F-#x2040] | 184 | |
| 192 | (defconst xml-name-char-re (concat "[" name-chars "]")) | 185 | ;; [6] Names ::= Name (#x20 Name)* |
| 193 | ;; [5] Name ::= NameStartChar (NameChar)* | 186 | (defconst xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*")) |
| 194 | (defconst xml-name-re (concat xml-name-start-char-re xml-name-char-re "*")) | 187 | |
| 195 | ;; [6] Names ::= Name (#x20 Name)* | 188 | ;; [7] Nmtoken ::= (NameChar)+ |
| 196 | (defconst xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*")) | 189 | (defconst xml-nmtoken-re (concat xml-name-char-re "+")) |
| 197 | ;; [7] Nmtoken ::= (NameChar)+ | 190 | |
| 198 | (defconst xml-nmtoken-re (concat xml-name-char-re "+")) | 191 | ;; [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* |
| 199 | ;; [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* | 192 | (defconst xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*")) |
| 200 | (defconst xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*")) | 193 | |
| 201 | ;; [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' | 194 | ;; [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' |
| 202 | (defconst xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)") | 195 | (defconst xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)") |
| 203 | ;; [68] EntityRef ::= '&' Name ';' | 196 | |
| 204 | (defconst xml-entity-ref (concat "&" xml-name-re ";")) | 197 | ;; [68] EntityRef ::= '&' Name ';' |
| 205 | ;; [69] PEReference ::= '%' Name ';' | 198 | (defconst xml-entity-ref (concat "&" xml-name-re ";")) |
| 206 | (defconst xml-pe-reference-re (concat "%" xml-name-re ";")) | 199 | |
| 207 | ;; [67] Reference ::= EntityRef | CharRef | 200 | (defconst xml-entity-or-char-ref-re (concat "&\\(?:#\\(x\\)?\\([0-9]+\\)\\|\\(" |
| 208 | (defconst xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)")) | 201 | xml-name-re "\\)\\);")) |
| 209 | ;; [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" | 202 | |
| 210 | (defconst xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" xml-reference-re "\\)*\"\\|" | 203 | ;; [69] PEReference ::= '%' Name ';' |
| 211 | "'\\(?:[^&']\\|" xml-reference-re "\\)*'\\)")) | 204 | (defconst xml-pe-reference-re (concat "%\\(" xml-name-re "\\);")) |
| 212 | ;; [56] TokenizedType ::= 'ID' [VC: ID] [VC: One ID / Element Type] [VC: ID Attribute Default] | 205 | |
| 213 | ;; | 'IDREF' [VC: IDREF] | 206 | ;; [67] Reference ::= EntityRef | CharRef |
| 214 | ;; | 'IDREFS' [VC: IDREF] | 207 | (defconst xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)")) |
| 215 | ;; | 'ENTITY' [VC: Entity Name] | 208 | |
| 216 | ;; | 'ENTITIES' [VC: Entity Name] | 209 | ;; [10] AttValue ::= '"' ([^<&"] | Reference)* '"' |
| 217 | ;; | 'NMTOKEN' [VC: Name Token] | 210 | ;; | "'" ([^<&'] | Reference)* "'" |
| 218 | ;; | 'NMTOKENS' [VC: Name Token] | 211 | (defconst xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" |
| 219 | (defconst xml-tokenized-type-re (concat "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|" | 212 | xml-reference-re "\\)*\"\\|" |
| 220 | "ENTITIES\\|NMTOKEN\\|NMTOKENS\\)")) | 213 | "'\\(?:[^&']\\|" xml-reference-re |
| 221 | ;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' | 214 | "\\)*'\\)")) |
| 222 | (defconst xml-notation-type-re | 215 | |
| 223 | (concat "\\(?:NOTATION" whitespace "(" whitespace "*" xml-name-re | 216 | ;; [56] TokenizedType ::= 'ID' |
| 224 | "\\(?:" whitespace "*|" whitespace "*" xml-name-re "\\)*" | 217 | ;; [VC: ID] [VC: One ID / Element Type] [VC: ID Attribute Default] |
| 225 | whitespace "*)\\)")) | 218 | ;; | 'IDREF' [VC: IDREF] |
| 226 | ;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' | 219 | ;; | 'IDREFS' [VC: IDREF] |
| 227 | ;; [VC: Enumeration] [VC: No Duplicate Tokens] | 220 | ;; | 'ENTITY' [VC: Entity Name] |
| 228 | (defconst xml-enumeration-re (concat "\\(?:(" whitespace "*" xml-nmtoken-re | 221 | ;; | 'ENTITIES' [VC: Entity Name] |
| 229 | "\\(?:" whitespace "*|" whitespace "*" | 222 | ;; | 'NMTOKEN' [VC: Name Token] |
| 230 | xml-nmtoken-re "\\)*" | 223 | ;; | 'NMTOKENS' [VC: Name Token] |
| 231 | whitespace ")\\)")) | 224 | (defconst xml-tokenized-type-re (concat "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|" |
| 232 | ;; [57] EnumeratedType ::= NotationType | Enumeration | 225 | "ENTITIES\\|NMTOKEN\\|NMTOKENS\\)")) |
| 233 | (defconst xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re | 226 | |
| 234 | "\\|" xml-enumeration-re "\\)")) | 227 | ;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' |
| 235 | ;; [54] AttType ::= StringType | TokenizedType | EnumeratedType | 228 | (defconst xml-notation-type-re |
| 236 | ;; [55] StringType ::= 'CDATA' | 229 | (concat "\\(?:NOTATION\\s-+(\\s-*" xml-name-re |
| 237 | (defconst xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re | 230 | "\\(?:\\s-*|\\s-*" xml-name-re "\\)*\\s-*)\\)")) |
| 238 | "\\|" xml-notation-type-re | 231 | |
| 239 | "\\|" xml-enumerated-type-re "\\)")) | 232 | ;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' |
| 240 | ;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) | 233 | ;; [VC: Enumeration] [VC: No Duplicate Tokens] |
| 241 | (defconst xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|\\(?:#FIXED" | 234 | (defconst xml-enumeration-re (concat "\\(?:(\\s-*" xml-nmtoken-re |
| 242 | whitespace "\\)*" xml-att-value-re "\\)")) | 235 | "\\(?:\\s-*|\\s-*" xml-nmtoken-re |
| 243 | ;; [53] AttDef ::= S Name S AttType S DefaultDecl | 236 | "\\)*\\s-+)\\)")) |
| 244 | (defconst xml-att-def-re (concat "\\(?:" whitespace "*" xml-name-re | 237 | |
| 245 | whitespace "*" xml-att-type-re | 238 | ;; [57] EnumeratedType ::= NotationType | Enumeration |
| 246 | whitespace "*" xml-default-decl-re "\\)")) | 239 | (defconst xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re |
| 247 | ;; [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | 240 | "\\|" xml-enumeration-re "\\)")) |
| 248 | ;; | "'" ([^%&'] | PEReference | Reference)* "'" | 241 | |
| 249 | (defconst xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re | 242 | ;; [54] AttType ::= StringType | TokenizedType | EnumeratedType |
| 250 | "\\|" xml-reference-re | 243 | ;; [55] StringType ::= 'CDATA' |
| 251 | "\\)*\"\\|'\\(?:[^%&']\\|" | 244 | (defconst xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re |
| 252 | xml-pe-reference-re "\\|" | 245 | "\\|" xml-notation-type-re |
| 253 | xml-reference-re "\\)*'\\)")))) | 246 | "\\|" xml-enumerated-type-re "\\)")) |
| 247 | |||
| 248 | ;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) | ||
| 249 | (defconst xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|" | ||
| 250 | "\\(?:#FIXED\\s-+\\)*" | ||
| 251 | xml-att-value-re "\\)")) | ||
| 252 | |||
| 253 | ;; [53] AttDef ::= S Name S AttType S DefaultDecl | ||
| 254 | (defconst xml-att-def-re (concat "\\(?:\\s-*" xml-name-re | ||
| 255 | "\\s-*" xml-att-type-re | ||
| 256 | "\\s-*" xml-default-decl-re "\\)")) | ||
| 257 | |||
| 258 | ;; [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | ||
| 259 | ;; | "'" ([^%&'] | PEReference | Reference)* "'" | ||
| 260 | (defconst xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" | ||
| 261 | xml-pe-reference-re | ||
| 262 | "\\|" xml-reference-re | ||
| 263 | "\\)*\"\\|'\\(?:[^%&']\\|" | ||
| 264 | xml-pe-reference-re "\\|" | ||
| 265 | xml-reference-re "\\)*'\\)")) | ||
| 266 | ) ; End of `eval-when-compile' | ||
| 267 | |||
| 254 | 268 | ||
| 255 | ;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 269 | ;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral |
| 256 | ;; | 'PUBLIC' S PubidLiteral S SystemLiteral | 270 | ;; | 'PUBLIC' S PubidLiteral S SystemLiteral |
| @@ -263,53 +277,59 @@ If PARSE-NS is non-nil, then QNAMES are expanded." | |||
| 263 | 277 | ||
| 264 | ;; Note that this is setup so that we can do whitespace-skipping with | 278 | ;; Note that this is setup so that we can do whitespace-skipping with |
| 265 | ;; `(skip-syntax-forward " ")', inter alia. Previously this was slow | 279 | ;; `(skip-syntax-forward " ")', inter alia. Previously this was slow |
| 266 | ;; compared with `re-search-forward', but that has been fixed. Also | 280 | ;; compared with `re-search-forward', but that has been fixed. |
| 267 | ;; note that the standard syntax table contains other characters with | ||
| 268 | ;; whitespace syntax, like NBSP, but they are invalid in contexts in | ||
| 269 | ;; which we might skip whitespace -- specifically, they're not | ||
| 270 | ;; NameChars [XML 4]. | ||
| 271 | 281 | ||
| 272 | (defvar xml-syntax-table | 282 | (defvar xml-syntax-table |
| 273 | (let ((table (make-syntax-table))) | 283 | ;; By default, characters have symbol syntax. |
| 274 | ;; Get space syntax correct per XML [3]. | 284 | (let ((table (make-char-table 'syntax-table '(3)))) |
| 275 | (dotimes (c 31) | 285 | ;; The XML space chars [3], and nothing else, have space syntax. |
| 276 | (modify-syntax-entry c "." table)) ; all are space in standard table | 286 | (dolist (c '(?\s ?\t ?\r ?\n)) |
| 277 | (dolist (c '(?\t ?\n ?\r)) ; these should be space | ||
| 278 | (modify-syntax-entry c " " table)) | 287 | (modify-syntax-entry c " " table)) |
| 279 | ;; For skipping attributes. | 288 | ;; The characters in NameStartChar [4], aside from ':' and '_', |
| 280 | (modify-syntax-entry ?\" "\"" table) | 289 | ;; have word syntax. This is used by `xml-name-start-char-re'. |
| 281 | (modify-syntax-entry ?' "\"" table) | 290 | (modify-syntax-entry '(?A . ?Z) "w" table) |
| 282 | ;; Non-alnum name chars should be symbol constituents (`-' and `_' | 291 | (modify-syntax-entry '(?a . ?z) "w" table) |
| 283 | ;; are OK by default). | 292 | (modify-syntax-entry '(#xC0 . #xD6) "w" table) |
| 284 | (modify-syntax-entry ?. "_" table) | 293 | (modify-syntax-entry '(#xD8 . #XF6) "w" table) |
| 285 | (modify-syntax-entry ?: "_" table) | 294 | (modify-syntax-entry '(#xF8 . #X2FF) "w" table) |
| 286 | ;; XML [89] | 295 | (modify-syntax-entry '(#x370 . #X37D) "w" table) |
| 287 | (unless (featurep 'xemacs) | 296 | (modify-syntax-entry '(#x37F . #x1FFF) "w" table) |
| 288 | (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005 | 297 | (modify-syntax-entry '(#x200C . #x200D) "w" table) |
| 289 | #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC | 298 | (modify-syntax-entry '(#x2070 . #x218F) "w" table) |
| 290 | #x30FD #x30FE)) | 299 | (modify-syntax-entry '(#x2C00 . #x2FEF) "w" table) |
| 291 | (modify-syntax-entry (decode-char 'ucs c) "w" table))) | 300 | (modify-syntax-entry '(#x3001 . #xD7FF) "w" table) |
| 292 | ;; Fixme: rest of [4] | 301 | (modify-syntax-entry '(#xF900 . #xFDCF) "w" table) |
| 302 | (modify-syntax-entry '(#xFDF0 . #xFFFD) "w" table) | ||
| 303 | (modify-syntax-entry '(#x10000 . #xEFFFF) "w" table) | ||
| 293 | table) | 304 | table) |
| 294 | "Syntax table used by `xml-parse-region'.") | 305 | "Syntax table used by the XML parser. |
| 306 | In this syntax table, the XML space characters [ \\t\\r\\n], and | ||
| 307 | only those characters, have whitespace syntax.") | ||
| 295 | 308 | ||
| 296 | ;; XML [5] | 309 | ;;; Entry points: |
| 297 | 310 | ||
| 298 | ;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e. | 311 | ;;;###autoload |
| 299 | ;; document ::= prolog element Misc* | 312 | (defun xml-parse-file (file &optional parse-dtd parse-ns) |
| 300 | ;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? | 313 | "Parse the well-formed XML file FILE. |
| 314 | Return the top node with all its children. | ||
| 315 | If PARSE-DTD is non-nil, the DTD is parsed rather than skipped. | ||
| 316 | If PARSE-NS is non-nil, then QNAMES are expanded." | ||
| 317 | (with-temp-buffer | ||
| 318 | (insert-file-contents file) | ||
| 319 | (xml--parse-buffer parse-dtd parse-ns))) | ||
| 301 | 320 | ||
| 302 | ;;;###autoload | 321 | ;;;###autoload |
| 303 | (defun xml-parse-region (&optional beg end buffer parse-dtd parse-ns) | 322 | (defun xml-parse-region (&optional beg end buffer parse-dtd parse-ns) |
| 304 | "Parse the region from BEG to END in BUFFER. | 323 | "Parse the region from BEG to END in BUFFER. |
| 324 | Return the XML parse tree, or raise an error if the region does | ||
| 325 | not contain well-formed XML. | ||
| 326 | |||
| 305 | If BEG is nil, it defaults to `point-min'. | 327 | If BEG is nil, it defaults to `point-min'. |
| 306 | If END is nil, it defaults to `point-max'. | 328 | If END is nil, it defaults to `point-max'. |
| 307 | If BUFFER is nil, it defaults to the current buffer. | 329 | If BUFFER is nil, it defaults to the current buffer. |
| 308 | Returns the XML list for the region, or raises an error if the region | 330 | If PARSE-DTD is non-nil, parse the DTD and return it as the first |
| 309 | is not well-formed XML. | 331 | element of the list. |
| 310 | If PARSE-DTD is non-nil, the DTD is parsed rather than skipped, | 332 | If PARSE-NS is non-nil, expand QNAMES." |
| 311 | and returned as the first element of the list. | ||
| 312 | If PARSE-NS is non-nil, then QNAMES are expanded." | ||
| 313 | ;; Use fixed syntax table to ensure regexp char classes and syntax | 333 | ;; Use fixed syntax table to ensure regexp char classes and syntax |
| 314 | ;; specs DTRT. | 334 | ;; specs DTRT. |
| 315 | (unless buffer | 335 | (unless buffer |
| @@ -318,8 +338,14 @@ If PARSE-NS is non-nil, then QNAMES are expanded." | |||
| 318 | (insert-buffer-substring-no-properties buffer beg end) | 338 | (insert-buffer-substring-no-properties buffer beg end) |
| 319 | (xml--parse-buffer parse-dtd parse-ns))) | 339 | (xml--parse-buffer parse-dtd parse-ns))) |
| 320 | 340 | ||
| 341 | ;; XML [5] | ||
| 342 | |||
| 343 | ;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e. | ||
| 344 | ;; document ::= prolog element Misc* | ||
| 345 | ;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? | ||
| 346 | |||
| 321 | (defun xml--parse-buffer (parse-dtd parse-ns) | 347 | (defun xml--parse-buffer (parse-dtd parse-ns) |
| 322 | (with-syntax-table (standard-syntax-table) | 348 | (with-syntax-table xml-syntax-table |
| 323 | (let ((case-fold-search nil) ; XML is case-sensitive. | 349 | (let ((case-fold-search nil) ; XML is case-sensitive. |
| 324 | ;; Prevent entity definitions from changing the defaults | 350 | ;; Prevent entity definitions from changing the defaults |
| 325 | (xml-entity-alist xml-entity-alist) | 351 | (xml-entity-alist xml-entity-alist) |
| @@ -374,22 +400,6 @@ specify that the name shouldn't be given a namespace." | |||
| 374 | (cons ns (if special "" lname))) | 400 | (cons ns (if special "" lname))) |
| 375 | (intern name))) | 401 | (intern name))) |
| 376 | 402 | ||
| 377 | (defun xml-parse-fragment (&optional parse-dtd parse-ns) | ||
| 378 | "Parse xml-like fragments." | ||
| 379 | (let ((xml-sub-parser t) | ||
| 380 | ;; Prevent entity definitions from changing the defaults | ||
| 381 | (xml-entity-alist xml-entity-alist) | ||
| 382 | (xml-parameter-entity-alist xml-parameter-entity-alist) | ||
| 383 | children) | ||
| 384 | (while (not (eobp)) | ||
| 385 | (let ((bit (xml-parse-tag-1 parse-dtd parse-ns))) | ||
| 386 | (if children | ||
| 387 | (setq children (append (list bit) children)) | ||
| 388 | (if (stringp bit) | ||
| 389 | (setq children (list bit)) | ||
| 390 | (setq children bit))))) | ||
| 391 | (reverse children))) | ||
| 392 | |||
| 393 | (defun xml-parse-tag (&optional parse-dtd parse-ns) | 403 | (defun xml-parse-tag (&optional parse-dtd parse-ns) |
| 394 | "Parse the tag at point. | 404 | "Parse the tag at point. |
| 395 | If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and | 405 | If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and |
| @@ -401,12 +411,17 @@ Return one of: | |||
| 401 | - a list : the matching node | 411 | - a list : the matching node |
| 402 | - nil : the point is not looking at a tag. | 412 | - nil : the point is not looking at a tag. |
| 403 | - a pair : the first element is the DTD, the second is the node." | 413 | - a pair : the first element is the DTD, the second is the node." |
| 404 | (let ((buf (current-buffer)) | 414 | (let* ((case-fold-search nil) |
| 405 | (pos (point))) | 415 | ;; Prevent entity definitions from changing the defaults |
| 416 | (xml-entity-alist xml-entity-alist) | ||
| 417 | (xml-parameter-entity-alist xml-parameter-entity-alist) | ||
| 418 | (buf (current-buffer)) | ||
| 419 | (pos (point))) | ||
| 406 | (with-temp-buffer | 420 | (with-temp-buffer |
| 407 | (insert-buffer-substring-no-properties buf pos) | 421 | (with-syntax-table xml-syntax-table |
| 408 | (goto-char (point-min)) | 422 | (insert-buffer-substring-no-properties buf pos) |
| 409 | (xml-parse-tag-1 parse-dtd parse-ns)))) | 423 | (goto-char (point-min)) |
| 424 | (xml-parse-tag-1 parse-dtd parse-ns))))) | ||
| 410 | 425 | ||
| 411 | (defun xml-parse-tag-1 (&optional parse-dtd parse-ns) | 426 | (defun xml-parse-tag-1 (&optional parse-dtd parse-ns) |
| 412 | "Like `xml-parse-tag', but possibly modify the buffer while working." | 427 | "Like `xml-parse-tag', but possibly modify the buffer while working." |
| @@ -530,40 +545,32 @@ references." | |||
| 530 | (skip-chars-forward "^<&") | 545 | (skip-chars-forward "^<&") |
| 531 | (when (eq (char-after) ?&) | 546 | (when (eq (char-after) ?&) |
| 532 | ;; If we find an entity or character reference, expand it. | 547 | ;; If we find an entity or character reference, expand it. |
| 533 | (unless (looking-at (eval-when-compile | 548 | (unless (looking-at xml-entity-or-char-ref-re) |
| 534 | (concat "&\\(?:#\\([0-9]+\\)\\|#x\\([0-9a-fA-F]+\\)\\|\\(" | ||
| 535 | xml-name-re "\\)\\);"))) | ||
| 536 | (error "XML: (Not Well-Formed) Invalid entity reference")) | 549 | (error "XML: (Not Well-Formed) Invalid entity reference")) |
| 537 | ;; For a character reference, the next entity or character | 550 | ;; For a character reference, the next entity or character |
| 538 | ;; reference must be after the replacement. [4.6] "Numerical | 551 | ;; reference must be after the replacement. [4.6] "Numerical |
| 539 | ;; character references are expanded immediately when | 552 | ;; character references are expanded immediately when |
| 540 | ;; recognized and MUST be treated as character data." | 553 | ;; recognized and MUST be treated as character data." |
| 541 | (cond ((setq ref (match-string 1)) | 554 | (if (setq ref (match-string 2)) |
| 542 | ;; Decimal character reference | 555 | (progn ; Numeric char reference |
| 543 | (setq val (save-match-data | 556 | (setq val (save-match-data |
| 544 | (decode-char 'ucs (string-to-number ref)))) | 557 | (decode-char 'ucs (string-to-number |
| 545 | (and (null val) | 558 | ref (if (match-string 1) 16))))) |
| 546 | xml-validating-parser | 559 | (and (null val) |
| 547 | (error "XML: (Validity) Invalid character `%s'" ref)) | 560 | xml-validating-parser |
| 548 | (replace-match (or (string val) xml-undefined-entity) t t)) | 561 | (error "XML: (Validity) Invalid character reference `%s'" |
| 549 | ;; Hexadecimal character reference | 562 | (match-string 0))) |
| 550 | ((setq ref (match-string 2)) | 563 | (replace-match (or (string val) xml-undefined-entity) t t)) |
| 551 | (setq val (save-match-data | 564 | ;; For an entity reference, search again from the start of |
| 552 | (decode-char 'ucs (string-to-number ref 16)))) | 565 | ;; the replaced text, since the replacement can contain |
| 553 | (and (null val) | 566 | ;; entity or character references, or markup. |
| 554 | xml-validating-parser | 567 | (setq ref (match-string 3) |
| 555 | (error "XML: (Validity) Invalid character `x%s'" ref)) | 568 | val (assoc ref xml-entity-alist)) |
| 556 | (replace-match (or (string val) xml-undefined-entity) t t)) | 569 | (and (null val) |
| 557 | ;; For an entity reference, search again from the start | 570 | xml-validating-parser |
| 558 | ;; of the replaced text, since the replacement can | 571 | (error "XML: (Validity) Undefined entity `%s'" ref)) |
| 559 | ;; contain entity or character references, or markup. | 572 | (replace-match (cdr val) t t) |
| 560 | ((setq ref (match-string 3)) | 573 | (goto-char (match-beginning 0))) |
| 561 | (setq val (assoc ref xml-entity-alist)) | ||
| 562 | (and (null val) | ||
| 563 | xml-validating-parser | ||
| 564 | (error "XML: (Validity) Undefined entity `%s'" ref)) | ||
| 565 | (replace-match (cdr val) t t) | ||
| 566 | (goto-char (match-beginning 0)))) | ||
| 567 | ;; Check for XML bombs. | 574 | ;; Check for XML bombs. |
| 568 | (and xml-entity-expansion-limit | 575 | (and xml-entity-expansion-limit |
| 569 | (> (- (buffer-size) (point)) | 576 | (> (- (buffer-size) (point)) |
| @@ -610,8 +617,9 @@ Leave point at the first non-blank character after the tag." | |||
| 610 | (replace-regexp-in-string "\\s-\\{2,\\}" " " string) | 617 | (replace-regexp-in-string "\\s-\\{2,\\}" " " string) |
| 611 | (let ((expansion (xml-substitute-special string))) | 618 | (let ((expansion (xml-substitute-special string))) |
| 612 | (unless (stringp expansion) | 619 | (unless (stringp expansion) |
| 613 | ; We say this is the constraint. It is actually that neither | 620 | ;; We say this is the constraint. It is actually that |
| 614 | ; external entities nor "<" can be in an attribute value. | 621 | ;; neither external entities nor "<" can be in an |
| 622 | ;; attribute value. | ||
| 615 | (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements")) | 623 | (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements")) |
| 616 | (push (cons name expansion) attlist))) | 624 | (push (cons name expansion) attlist))) |
| 617 | 625 | ||
| @@ -643,8 +651,6 @@ This follows the rule [28] in the XML specifications." | |||
| 643 | (looking-at xml-name-re) | 651 | (looking-at xml-name-re) |
| 644 | (let ((dtd (list (match-string-no-properties 0) 'dtd)) | 652 | (let ((dtd (list (match-string-no-properties 0) 'dtd)) |
| 645 | (xml-parameter-entity-alist xml-parameter-entity-alist) | 653 | (xml-parameter-entity-alist xml-parameter-entity-alist) |
| 646 | (parameter-entity-re (eval-when-compile | ||
| 647 | (concat "%\\(" xml-name-re "\\);"))) | ||
| 648 | next-parameter-entity) | 654 | next-parameter-entity) |
| 649 | (goto-char (match-end 0)) | 655 | (goto-char (match-end 0)) |
| 650 | (skip-syntax-forward " ") | 656 | (skip-syntax-forward " ") |
| @@ -693,7 +699,7 @@ This follows the rule [28] in the XML specifications." | |||
| 693 | ;; and try again. | 699 | ;; and try again. |
| 694 | (setq next-parameter-entity | 700 | (setq next-parameter-entity |
| 695 | (save-excursion | 701 | (save-excursion |
| 696 | (if (re-search-forward parameter-entity-re nil t) | 702 | (if (re-search-forward xml-pe-reference-re nil t) |
| 697 | (match-beginning 0)))) | 703 | (match-beginning 0)))) |
| 698 | 704 | ||
| 699 | ;; Parse the rest of the DTD | 705 | ;; Parse the rest of the DTD |
| @@ -752,7 +758,7 @@ This follows the rule [28] in the XML specifications." | |||
| 752 | (> (point) next-parameter-entity) | 758 | (> (point) next-parameter-entity) |
| 753 | (setq next-parameter-entity | 759 | (setq next-parameter-entity |
| 754 | (save-excursion | 760 | (save-excursion |
| 755 | (if (re-search-forward parameter-entity-re nil t) | 761 | (if (re-search-forward xml-pe-reference-re nil t) |
| 756 | (match-beginning 0)))))) | 762 | (match-beginning 0)))))) |
| 757 | 763 | ||
| 758 | ;; Internal entity declarations: | 764 | ;; Internal entity declarations: |
| @@ -796,7 +802,7 @@ This follows the rule [28] in the XML specifications." | |||
| 796 | (next-parameter-entity | 802 | (next-parameter-entity |
| 797 | (save-excursion | 803 | (save-excursion |
| 798 | (goto-char next-parameter-entity) | 804 | (goto-char next-parameter-entity) |
| 799 | (unless (looking-at parameter-entity-re) | 805 | (unless (looking-at xml-pe-reference-re) |
| 800 | (error "XML: Internal error")) | 806 | (error "XML: Internal error")) |
| 801 | (let* ((entity (match-string 1)) | 807 | (let* ((entity (match-string 1)) |
| 802 | (beg (point-marker)) | 808 | (beg (point-marker)) |
| @@ -808,7 +814,7 @@ This follows the rule [28] in the XML specifications." | |||
| 808 | (goto-char next-parameter-entity)) | 814 | (goto-char next-parameter-entity)) |
| 809 | (goto-char (match-end 0)))) | 815 | (goto-char (match-end 0)))) |
| 810 | (setq next-parameter-entity | 816 | (setq next-parameter-entity |
| 811 | (if (re-search-forward parameter-entity-re nil t) | 817 | (if (re-search-forward xml-pe-reference-re nil t) |
| 812 | (match-beginning 0))))) | 818 | (match-beginning 0))))) |
| 813 | 819 | ||
| 814 | ;; Anything else is garbage (ignored if not validating). | 820 | ;; Anything else is garbage (ignored if not validating). |
| @@ -889,20 +895,17 @@ references and parameter-entity references." | |||
| 889 | (defun xml-substitute-special (string) | 895 | (defun xml-substitute-special (string) |
| 890 | "Return STRING, after substituting entity and character references. | 896 | "Return STRING, after substituting entity and character references. |
| 891 | STRING is assumed to occur in an XML attribute value." | 897 | STRING is assumed to occur in an XML attribute value." |
| 892 | (let ((ref-re (eval-when-compile | 898 | (let ((strlen (length string)) |
| 893 | (concat "&\\(?:#\\(x\\)?\\([0-9]+\\)\\|\\(" | ||
| 894 | xml-name-re "\\)\\);"))) | ||
| 895 | (strlen (length string)) | ||
| 896 | children) | 899 | children) |
| 897 | (while (string-match ref-re string) | 900 | (while (string-match xml-entity-or-char-ref-re string) |
| 898 | (push (substring string 0 (match-beginning 0)) children) | 901 | (push (substring string 0 (match-beginning 0)) children) |
| 899 | (let* ((remainder (substring string (match-end 0))) | 902 | (let* ((remainder (substring string (match-end 0))) |
| 900 | (ref (match-string 2 string))) | 903 | (is-hex (match-string 1 string)) ; Is it a hex numeric reference? |
| 904 | (ref (match-string 2 string))) ; Numeric part of reference | ||
| 901 | (if ref | 905 | (if ref |
| 902 | ;; [4.6] Character references are included as | 906 | ;; [4.6] Character references are included as |
| 903 | ;; character data. | 907 | ;; character data. |
| 904 | (let ((val (decode-char 'ucs (string-to-number | 908 | (let ((val (decode-char 'ucs (string-to-number ref (if is-hex 16))))) |
| 905 | ref (if (match-string 1 string) 16))))) | ||
| 906 | (push (cond (val (string val)) | 909 | (push (cond (val (string val)) |
| 907 | (xml-validating-parser | 910 | (xml-validating-parser |
| 908 | (error "XML: (Validity) Undefined character `x%s'" ref)) | 911 | (error "XML: (Validity) Undefined character `x%s'" ref)) |
| @@ -913,7 +916,7 @@ STRING is assumed to occur in an XML attribute value." | |||
| 913 | ;; [4.4.5] Entity references are "included in literal". | 916 | ;; [4.4.5] Entity references are "included in literal". |
| 914 | ;; Note that we don't need do anything special to treat | 917 | ;; Note that we don't need do anything special to treat |
| 915 | ;; quotes as normal data characters. | 918 | ;; quotes as normal data characters. |
| 916 | (setq ref (match-string 3 string)) | 919 | (setq ref (match-string 3 string)) ; entity name |
| 917 | (let ((val (or (cdr (assoc ref xml-entity-alist)) | 920 | (let ((val (or (cdr (assoc ref xml-entity-alist)) |
| 918 | (if xml-validating-parser | 921 | (if xml-validating-parser |
| 919 | (error "XML: (Validity) Undefined entity `%s'" ref) | 922 | (error "XML: (Validity) Undefined entity `%s'" ref) |
diff --git a/test/automated/xml-parse-tests.el b/test/automated/xml-parse-tests.el index ada9bbd4074..e6553060345 100644 --- a/test/automated/xml-parse-tests.el +++ b/test/automated/xml-parse-tests.el | |||
| @@ -30,10 +30,10 @@ | |||
| 30 | (require 'xml) | 30 | (require 'xml) |
| 31 | 31 | ||
| 32 | (defvar xml-parse-tests--data | 32 | (defvar xml-parse-tests--data |
| 33 | '(;; General entity substitution | 33 | `(;; General entity substitution |
| 34 | ("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY ent \"AbC\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" . | 34 | ("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY ent \"AbC\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" . |
| 35 | ((foo ((a . "b")) (bar nil "AbC;")))) | 35 | ((foo ((a . "b")) (bar nil "AbC;")))) |
| 36 | ("<?xml version=\"1.0\"?><foo>&amp;&apos;'<>"</foo>" . | 36 | ("<?xml version=\"1.0\"?><foo>&amp;&apos;'<>"</foo>" . |
| 37 | ((foo () "&''<>\""))) | 37 | ((foo () "&''<>\""))) |
| 38 | ;; Parameter entity substitution | 38 | ;; Parameter entity substitution |
| 39 | ("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY % pent \"AbC\"><!ENTITY ent \"%pent;\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" . | 39 | ("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY % pent \"AbC\"><!ENTITY ent \"%pent;\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" . |
| @@ -52,7 +52,11 @@ | |||
| 52 | ((foo ((a . "-aBc-")) "1"))) | 52 | ((foo ((a . "-aBc-")) "1"))) |
| 53 | ;; Character references must be treated as character data | 53 | ;; Character references must be treated as character data |
| 54 | ("<foo>AT&T;</foo>" . ((foo () "AT&T;"))) | 54 | ("<foo>AT&T;</foo>" . ((foo () "AT&T;"))) |
| 55 | ("<foo>&amp;</foo>" . ((foo () "&")))) | 55 | ("<foo>&amp;</foo>" . ((foo () "&"))) |
| 56 | ("<foo>&amp;</foo>" . ((foo () "&"))) | ||
| 57 | ;; Unusual but valid XML names [5] | ||
| 58 | ("<ÀÖØö.3·-‿⁀>abc</ÀÖØö.3·-‿⁀>" . ((,(intern "ÀÖØö.3·-‿⁀") () "abc"))) | ||
| 59 | ("<:>abc</:>" . ((,(intern ":") () "abc")))) | ||
| 56 | "Alist of XML strings and their expected parse trees.") | 60 | "Alist of XML strings and their expected parse trees.") |
| 57 | 61 | ||
| 58 | (defvar xml-parse-tests--bad-data | 62 | (defvar xml-parse-tests--bad-data |
| @@ -63,7 +67,11 @@ | |||
| 63 | ;; Non-terminating DTD | 67 | ;; Non-terminating DTD |
| 64 | "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">" | 68 | "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">" |
| 65 | "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">asdf" | 69 | "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">asdf" |
| 66 | "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">asdf&abc;") | 70 | "<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">asdf&abc;" |
| 71 | ;; Invalid XML names | ||
| 72 | "<0foo>abc</0foo>" | ||
| 73 | "<‿foo>abc</‿foo>" | ||
| 74 | "<f¿>abc</f¿>") | ||
| 67 | "List of XML strings that should signal an error in the parser") | 75 | "List of XML strings that should signal an error in the parser") |
| 68 | 76 | ||
| 69 | (ert-deftest xml-parse-tests () | 77 | (ert-deftest xml-parse-tests () |