diff options
| author | Juanma Barranquero | 2003-03-16 10:48:34 +0000 |
|---|---|---|
| committer | Juanma Barranquero | 2003-03-16 10:48:34 +0000 |
| commit | a158ff811faadd2891b3db1f9b204388ac640c60 (patch) | |
| tree | bf01f678fa65d63a8428aaa51661a5ea7b33fe70 | |
| parent | 906cbe4568735cd8184b72399588c13918111346 (diff) | |
| download | emacs-a158ff811faadd2891b3db1f9b204388ac640c60.tar.gz emacs-a158ff811faadd2891b3db1f9b204388ac640c60.zip | |
(xml-ucs-to-string): New function to convert Unicode codepoints to strings.
Uses decode-char (mule.el) if available.
(xml-parse-tag, xml-parse-attlist, xml-skip-dtd, xml-parse-dtd,
xml-parse-elem-type): Use ' \t\n\r' instead of '[:space:]'.
(xml-parse-attlist): Added attribute normalization.
(xml-parse-tag): Replace "\r\n" and "\r" with "\n".
| -rw-r--r-- | lisp/ChangeLog | 9 | ||||
| -rw-r--r-- | lisp/xml.el | 122 |
2 files changed, 93 insertions, 38 deletions
diff --git a/lisp/ChangeLog b/lisp/ChangeLog index 91dafdfbcd4..895ef1bd5bb 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog | |||
| @@ -1,3 +1,12 @@ | |||
| 1 | 2003-03-15 Mark A. Hershberger <mah@everybody.org> | ||
| 2 | |||
| 3 | * xml.el (xml-ucs-to-string): New function to convert Unicode | ||
| 4 | codepoints to strings. Uses decode-char (mule.el) if available. | ||
| 5 | (xml-parse-tag, xml-parse-attlist, xml-skip-dtd, xml-parse-dtd) | ||
| 6 | (xml-parse-elem-type): Use ' \t\n\r' instead of '[:space:]'. | ||
| 7 | (xml-parse-attlist): Added attribute normalization. | ||
| 8 | (xml-parse-tag): Replace "\r\n" and "\r" with "\n". | ||
| 9 | |||
| 1 | 2003-03-14 John Paul Wallington <jpw@gnu.org> | 10 | 2003-03-14 John Paul Wallington <jpw@gnu.org> |
| 2 | 11 | ||
| 3 | * files.el (recover-session): Error if there are no previous | 12 | * files.el (recover-session): Error if there are no previous |
diff --git a/lisp/xml.el b/lisp/xml.el index d6a0bc74b45..fc6365b50a0 100644 --- a/lisp/xml.el +++ b/lisp/xml.el | |||
| @@ -184,7 +184,7 @@ Returns one of: | |||
| 184 | ;; beginning of a document) | 184 | ;; beginning of a document) |
| 185 | ((looking-at "<\\?") | 185 | ((looking-at "<\\?") |
| 186 | (search-forward "?>" end) | 186 | (search-forward "?>" end) |
| 187 | (goto-char (- (re-search-forward "[^[:space:]]") 1)) | 187 | (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) |
| 188 | (xml-parse-tag end)) | 188 | (xml-parse-tag end)) |
| 189 | ;; Character data (CDATA) sections, in which no tag should be interpreted | 189 | ;; Character data (CDATA) sections, in which no tag should be interpreted |
| 190 | ((looking-at "<!\\[CDATA\\[") | 190 | ((looking-at "<!\\[CDATA\\[") |
| @@ -198,7 +198,7 @@ Returns one of: | |||
| 198 | (if parse-dtd | 198 | (if parse-dtd |
| 199 | (setq dtd (xml-parse-dtd end)) | 199 | (setq dtd (xml-parse-dtd end)) |
| 200 | (xml-skip-dtd end)) | 200 | (xml-skip-dtd end)) |
| 201 | (goto-char (- (re-search-forward "[^[:space:]]") 1)) | 201 | (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) |
| 202 | (if dtd | 202 | (if dtd |
| 203 | (cons dtd (xml-parse-tag end)) | 203 | (cons dtd (xml-parse-tag end)) |
| 204 | (xml-parse-tag end)))) | 204 | (xml-parse-tag end)))) |
| @@ -210,7 +210,7 @@ Returns one of: | |||
| 210 | ((looking-at "</") | 210 | ((looking-at "</") |
| 211 | '()) | 211 | '()) |
| 212 | ;; opening tag | 212 | ;; opening tag |
| 213 | ((looking-at "<\\([^/>[:space:]]+\\)") | 213 | ((looking-at "<\\([^/> \t\n\r]+\\)") |
| 214 | (goto-char (match-end 1)) | 214 | (goto-char (match-end 1)) |
| 215 | (let* ((case-fold-search nil) ;; XML is case-sensitive. | 215 | (let* ((case-fold-search nil) ;; XML is case-sensitive. |
| 216 | (node-name (match-string 1)) | 216 | (node-name (match-string 1)) |
| @@ -219,7 +219,7 @@ Returns one of: | |||
| 219 | pos) | 219 | pos) |
| 220 | 220 | ||
| 221 | ;; is this an empty element ? | 221 | ;; is this an empty element ? |
| 222 | (if (looking-at "/[[:space:]]*>") | 222 | (if (looking-at "/[ \t\n\r]*>") |
| 223 | (progn | 223 | (progn |
| 224 | (forward-char 2) | 224 | (forward-char 2) |
| 225 | (nreverse (cons '("") children))) | 225 | (nreverse (cons '("") children))) |
| @@ -230,7 +230,7 @@ Returns one of: | |||
| 230 | (forward-char 1) | 230 | (forward-char 1) |
| 231 | ;; Now check that we have the right end-tag. Note that this | 231 | ;; Now check that we have the right end-tag. Note that this |
| 232 | ;; one might contain spaces after the tag name | 232 | ;; one might contain spaces after the tag name |
| 233 | (while (not (looking-at (concat "</" node-name "[[:space:]]*>"))) | 233 | (while (not (looking-at (concat "</" node-name "[ \t\n\r]*>"))) |
| 234 | (cond | 234 | (cond |
| 235 | ((looking-at "</") | 235 | ((looking-at "</") |
| 236 | (error (concat | 236 | (error (concat |
| @@ -248,12 +248,14 @@ Returns one of: | |||
| 248 | (let ((string (buffer-substring-no-properties pos (point))) | 248 | (let ((string (buffer-substring-no-properties pos (point))) |
| 249 | (pos 0)) | 249 | (pos 0)) |
| 250 | 250 | ||
| 251 | ;; Clean up the string (no newline characters) | 251 | ;; Clean up the string. As per XML |
| 252 | ;; Not done, since as per XML specifications, the XML processor | 252 | ;; specifications, the XML processor should |
| 253 | ;; should always pass the whole string to the application. | 253 | ;; always pass the whole string to the |
| 254 | ;; (while (string-match "\\s +" string pos) | 254 | ;; application. But \r's should be replaced: |
| 255 | ;; (setq string (replace-match " " t t string)) | 255 | ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends |
| 256 | ;; (setq pos (1+ (match-beginning 0)))) | 256 | (while (string-match "\r\n?" string pos) |
| 257 | (setq string (replace-match "\n" t t string)) | ||
| 258 | (setq pos (1+ (match-beginning 0)))) | ||
| 257 | 259 | ||
| 258 | (setq string (xml-substitute-special string)) | 260 | (setq string (xml-substitute-special string)) |
| 259 | (setq children | 261 | (setq children |
| @@ -280,28 +282,44 @@ Returns one of: | |||
| 280 | The search for attributes end at the position END in the current buffer. | 282 | The search for attributes end at the position END in the current buffer. |
| 281 | Leaves the point on the first non-blank character after the tag." | 283 | Leaves the point on the first non-blank character after the tag." |
| 282 | (let ((attlist ()) | 284 | (let ((attlist ()) |
| 283 | name) | 285 | start-pos name) |
| 284 | (goto-char (- (re-search-forward "[^[:space:]]") 1)) | 286 | (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) |
| 285 | (while (looking-at "\\([a-zA-Z_:][-a-zA-Z0-9._:]*\\)[[:space:]]*=[[:space:]]*") | 287 | (while (looking-at "\\([a-zA-Z_:][-a-zA-Z0-9._:]*\\)[ \t\n\r]*=[ \t\n\r]*") |
| 286 | (setq name (intern (match-string 1))) | 288 | (setq name (intern (match-string 1))) |
| 287 | (goto-char (match-end 0)) | 289 | (goto-char (match-end 0)) |
| 288 | 290 | ||
| 291 | ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize | ||
| 292 | |||
| 289 | ;; Do we have a string between quotes (or double-quotes), | 293 | ;; Do we have a string between quotes (or double-quotes), |
| 290 | ;; or a simple word ? | 294 | ;; or a simple word ? |
| 291 | (unless (looking-at "\"\\([^\"]*\\)\"") | 295 | (if (looking-at "\"\\([^\"]*\\)\"") |
| 292 | (unless (looking-at "'\\([^']*\\)'") | 296 | (setq start-pos (match-beginning 0)) |
| 297 | (if (looking-at "'\\([^']*\\)") | ||
| 298 | (setq start-pos (match-beginning 0)) | ||
| 293 | (error "XML: Attribute values must be given between quotes"))) | 299 | (error "XML: Attribute values must be given between quotes"))) |
| 294 | 300 | ||
| 295 | ;; Each attribute must be unique within a given element | 301 | ;; Each attribute must be unique within a given element |
| 296 | (if (assoc name attlist) | 302 | (if (assoc name attlist) |
| 297 | (error "XML: each attribute must be unique within an element")) | 303 | (error "XML: each attribute must be unique within an element")) |
| 298 | 304 | ||
| 299 | (push (cons name (match-string-no-properties 1)) attlist) | 305 | ;; Multiple whitespace characters should be replaced with a single one |
| 300 | (goto-char (match-end 0)) | 306 | ;; in the attributes |
| 301 | (goto-char (- (re-search-forward "[^[:space:]]") 1)) | 307 | (let ((string (match-string-no-properties 1)) |
| 308 | (pos 0)) | ||
| 309 | (while (string-match "[ \t\n\r]+" string pos) | ||
| 310 | (setq string (replace-match " " t nil string)) | ||
| 311 | (setq pos (1+ (match-beginning 0)))) | ||
| 312 | (push (cons name (xml-substitute-special string)) attlist)) | ||
| 313 | |||
| 314 | (goto-char start-pos) | ||
| 315 | (if (looking-at "\"\\([^\"]*\\)\"") | ||
| 316 | (goto-char (match-end 0)) | ||
| 317 | (if (looking-at "'\\([^']*\\)") | ||
| 318 | (goto-char (match-end 0)))) | ||
| 319 | |||
| 320 | (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) | ||
| 302 | (if (> (point) end) | 321 | (if (> (point) end) |
| 303 | (error "XML: end of attribute list not found before end of region")) | 322 | (error "XML: end of attribute list not found before end of region"))) |
| 304 | ) | ||
| 305 | (nreverse attlist))) | 323 | (nreverse attlist))) |
| 306 | 324 | ||
| 307 | ;;******************************************************************* | 325 | ;;******************************************************************* |
| @@ -318,15 +336,15 @@ The DTD must end before the position END in the current buffer. | |||
| 318 | The point must be just before the starting tag of the DTD. | 336 | The point must be just before the starting tag of the DTD. |
| 319 | This follows the rule [28] in the XML specifications." | 337 | This follows the rule [28] in the XML specifications." |
| 320 | (forward-char (length "<!DOCTYPE")) | 338 | (forward-char (length "<!DOCTYPE")) |
| 321 | (if (looking-at "[[:space:]]*>") | 339 | (if (looking-at "[ \t\n\r]*>") |
| 322 | (error "XML: invalid DTD (excepting name of the document)")) | 340 | (error "XML: invalid DTD (excepting name of the document)")) |
| 323 | (condition-case nil | 341 | (condition-case nil |
| 324 | (progn | 342 | (progn |
| 325 | (forward-word 1) ;; name of the document | 343 | (forward-word 1) |
| 326 | (goto-char (- (re-search-forward "[[:space:]]") 1)) | 344 | (goto-char (- (re-search-forward "[ \t\n\r]") 1)) |
| 327 | (goto-char (- (re-search-forward "[^[:space:]]") 1)) | 345 | (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) |
| 328 | (if (looking-at "\\[") | 346 | (if (looking-at "\\[") |
| 329 | (re-search-forward "\\][[:space:]]*>" end) | 347 | (re-search-forward "\\][ \t\n\r]*>" end) |
| 330 | (search-forward ">" end))) | 348 | (search-forward ">" end))) |
| 331 | (error (error "XML: No end to the DTD")))) | 349 | (error (error "XML: No end to the DTD")))) |
| 332 | 350 | ||
| @@ -334,7 +352,7 @@ This follows the rule [28] in the XML specifications." | |||
| 334 | "Parse the DTD that point is looking at. | 352 | "Parse the DTD that point is looking at. |
| 335 | The DTD must end before the position END in the current buffer." | 353 | The DTD must end before the position END in the current buffer." |
| 336 | (forward-char (length "<!DOCTYPE")) | 354 | (forward-char (length "<!DOCTYPE")) |
| 337 | (goto-char (- (re-search-forward "[^[:space:]]") 1)) | 355 | (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) |
| 338 | (if (looking-at ">") | 356 | (if (looking-at ">") |
| 339 | (error "XML: invalid DTD (excepting name of the document)")) | 357 | (error "XML: invalid DTD (excepting name of the document)")) |
| 340 | 358 | ||
| @@ -344,24 +362,24 @@ The DTD must end before the position END in the current buffer." | |||
| 344 | type element end-pos) | 362 | type element end-pos) |
| 345 | (goto-char (match-end 0)) | 363 | (goto-char (match-end 0)) |
| 346 | 364 | ||
| 347 | (goto-char (- (re-search-forward "[^[:space:]]") 1)) | 365 | (goto-char (- (re-search-forward "[^ \t\n\r]") 1)) |
| 348 | 366 | ||
| 349 | ;; External DTDs => don't know how to handle them yet | 367 | ;; External DTDs => don't know how to handle them yet |
| 350 | (if (looking-at "SYSTEM") | 368 | (if (looking-at "SYSTEM") |
| 351 | (error "XML: Don't know how to handle external DTDs")) | 369 | (error "XML: Don't know how to handle external DTDs")) |
| 352 | 370 | ||
| 353 | (if (not (= (char-after) ?\[)) | 371 | (if (not (= (char-after) ?\[)) |
| 354 | (error "XML: Unknown declaration in the DTD")) | 372 | (error "XML: Unknown declaration in the DTD")) |
| 355 | 373 | ||
| 356 | ;; Parse the rest of the DTD | 374 | ;; Parse the rest of the DTD |
| 357 | (forward-char 1) | 375 | (forward-char 1) |
| 358 | (while (and (not (looking-at "[[:space:]]*\\]")) | 376 | (while (and (not (looking-at "[ \t\n\r]*\\]")) |
| 359 | (<= (point) end)) | 377 | (<= (point) end)) |
| 360 | (cond | 378 | (cond |
| 361 | 379 | ||
| 362 | ;; Translation of rule [45] of XML specifications | 380 | ;; Translation of rule [45] of XML specifications |
| 363 | ((looking-at | 381 | ((looking-at |
| 364 | "[[:space:]]*<!ELEMENT[[:space:]]+\\([a-zA-Z0-9.%;]+\\)[[:space:]]+\\([^>]+\\)>") | 382 | "[ \t\n\r]*<!ELEMENT[ \t\n\r]+\\([a-zA-Z0-9.%;]+\\)[ \t\n\r]+\\([^>]+\\)>") |
| 365 | 383 | ||
| 366 | (setq element (intern (match-string-no-properties 1)) | 384 | (setq element (intern (match-string-no-properties 1)) |
| 367 | type (match-string-no-properties 2)) | 385 | type (match-string-no-properties 2)) |
| @@ -369,13 +387,13 @@ The DTD must end before the position END in the current buffer." | |||
| 369 | 387 | ||
| 370 | ;; Translation of rule [46] of XML specifications | 388 | ;; Translation of rule [46] of XML specifications |
| 371 | (cond | 389 | (cond |
| 372 | ((string-match "^EMPTY[[:space:]]*$" type) ;; empty declaration | 390 | ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration |
| 373 | (setq type 'empty)) | 391 | (setq type 'empty)) |
| 374 | ((string-match "^ANY[[:space:]]*$" type) ;; any type of contents | 392 | ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents |
| 375 | (setq type 'any)) | 393 | (setq type 'any)) |
| 376 | ((string-match "^(\\(.*\\))[[:space:]]*$" type) ;; children ([47]) | 394 | ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47]) |
| 377 | (setq type (xml-parse-elem-type (match-string-no-properties 1 type)))) | 395 | (setq type (xml-parse-elem-type (match-string-no-properties 1 type)))) |
| 378 | ((string-match "^%[^;]+;[[:space:]]*$" type) ;; substitution | 396 | ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution |
| 379 | nil) | 397 | nil) |
| 380 | (t | 398 | (t |
| 381 | (error "XML: Invalid element type in the DTD"))) | 399 | (error "XML: Invalid element type in the DTD"))) |
| @@ -417,8 +435,8 @@ The DTD must end before the position END in the current buffer." | |||
| 417 | (mapcar 'xml-parse-elem-type | 435 | (mapcar 'xml-parse-elem-type |
| 418 | (split-string elem ",")))) | 436 | (split-string elem ",")))) |
| 419 | ))) | 437 | ))) |
| 420 | (if (string-match "[[:space:]]*\\([^+*?]+\\)\\([+*?]?\\)" string) | 438 | (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string) |
| 421 | (setq elem (match-string 1 string) | 439 | (setq elem (match-string 1 string) |
| 422 | modifier (match-string 2 string)))) | 440 | modifier (match-string 2 string)))) |
| 423 | 441 | ||
| 424 | (if (and (stringp elem) (string= elem "#PCDATA")) | 442 | (if (and (stringp elem) (string= elem "#PCDATA")) |
| @@ -434,6 +452,22 @@ The DTD must end before the position END in the current buffer." | |||
| 434 | (t | 452 | (t |
| 435 | elem)))) | 453 | elem)))) |
| 436 | 454 | ||
| 455 | ;;******************************************************************* | ||
| 456 | ;;** | ||
| 457 | ;;** Converting code points to strings | ||
| 458 | ;;** | ||
| 459 | ;;******************************************************************* | ||
| 460 | |||
| 461 | (defun xml-ucs-to-string (codepoint) | ||
| 462 | "Return a string representation of CODEPOINT. If it can't be | ||
| 463 | converted, return '?'." | ||
| 464 | (cond ((boundp 'decode-char) | ||
| 465 | (char-to-string (decode-char 'ucs codepoint))) | ||
| 466 | ((and (< codepoint 128) | ||
| 467 | (> codepoint 31)) | ||
| 468 | (char-to-string codepoint)) | ||
| 469 | (t "?"))) ; FIXME: There's gotta be a better way to | ||
| 470 | ; designate an unknown character. | ||
| 437 | 471 | ||
| 438 | ;;******************************************************************* | 472 | ;;******************************************************************* |
| 439 | ;;** | 473 | ;;** |
| @@ -451,6 +485,18 @@ The DTD must end before the position END in the current buffer." | |||
| 451 | (setq string (replace-match "'" t nil string))) | 485 | (setq string (replace-match "'" t nil string))) |
| 452 | (while (string-match """ string) | 486 | (while (string-match """ string) |
| 453 | (setq string (replace-match "\"" t nil string))) | 487 | (setq string (replace-match "\"" t nil string))) |
| 488 | (while (string-match "&#\\([0-9]+\\);" string) | ||
| 489 | (setq string (replace-match (xml-ucs-to-string | ||
| 490 | (string-to-number | ||
| 491 | (match-string-no-properties 1 string))) | ||
| 492 | t nil string))) | ||
| 493 | (while (string-match "&#x\\([0-9a-fA-F]+\\);" string) | ||
| 494 | (setq string (replace-match (xml-ucs-to-string | ||
| 495 | (string-to-number | ||
| 496 | (match-string-no-properties 1 string) | ||
| 497 | 16)) | ||
| 498 | t nil string))) | ||
| 499 | |||
| 454 | ;; This goes last so it doesn't confuse the matches above. | 500 | ;; This goes last so it doesn't confuse the matches above. |
| 455 | (while (string-match "&" string) | 501 | (while (string-match "&" string) |
| 456 | (setq string (replace-match "&" t nil string))) | 502 | (setq string (replace-match "&" t nil string))) |