aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJuanma Barranquero2003-03-16 10:48:34 +0000
committerJuanma Barranquero2003-03-16 10:48:34 +0000
commita158ff811faadd2891b3db1f9b204388ac640c60 (patch)
treebf01f678fa65d63a8428aaa51661a5ea7b33fe70
parent906cbe4568735cd8184b72399588c13918111346 (diff)
downloademacs-a158ff811faadd2891b3db1f9b204388ac640c60.tar.gz
emacs-a158ff811faadd2891b3db1f9b204388ac640c60.zip
(xml-ucs-to-string): New function to convert Unicode codepoints to strings.
Uses decode-char (mule.el) if available. (xml-parse-tag, xml-parse-attlist, xml-skip-dtd, xml-parse-dtd, xml-parse-elem-type): Use ' \t\n\r' instead of '[:space:]'. (xml-parse-attlist): Added attribute normalization. (xml-parse-tag): Replace "\r\n" and "\r" with "\n".
-rw-r--r--lisp/ChangeLog9
-rw-r--r--lisp/xml.el122
2 files changed, 93 insertions, 38 deletions
diff --git a/lisp/ChangeLog b/lisp/ChangeLog
index 91dafdfbcd4..895ef1bd5bb 100644
--- a/lisp/ChangeLog
+++ b/lisp/ChangeLog
@@ -1,3 +1,12 @@
12003-03-15 Mark A. Hershberger <mah@everybody.org>
2
3 * xml.el (xml-ucs-to-string): New function to convert Unicode
4 codepoints to strings. Uses decode-char (mule.el) if available.
5 (xml-parse-tag, xml-parse-attlist, xml-skip-dtd, xml-parse-dtd)
6 (xml-parse-elem-type): Use ' \t\n\r' instead of '[:space:]'.
7 (xml-parse-attlist): Added attribute normalization.
8 (xml-parse-tag): Replace "\r\n" and "\r" with "\n".
9
12003-03-14 John Paul Wallington <jpw@gnu.org> 102003-03-14 John Paul Wallington <jpw@gnu.org>
2 11
3 * files.el (recover-session): Error if there are no previous 12 * files.el (recover-session): Error if there are no previous
diff --git a/lisp/xml.el b/lisp/xml.el
index d6a0bc74b45..fc6365b50a0 100644
--- a/lisp/xml.el
+++ b/lisp/xml.el
@@ -184,7 +184,7 @@ Returns one of:
184 ;; beginning of a document) 184 ;; beginning of a document)
185 ((looking-at "<\\?") 185 ((looking-at "<\\?")
186 (search-forward "?>" end) 186 (search-forward "?>" end)
187 (goto-char (- (re-search-forward "[^[:space:]]") 1)) 187 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
188 (xml-parse-tag end)) 188 (xml-parse-tag end))
189 ;; Character data (CDATA) sections, in which no tag should be interpreted 189 ;; Character data (CDATA) sections, in which no tag should be interpreted
190 ((looking-at "<!\\[CDATA\\[") 190 ((looking-at "<!\\[CDATA\\[")
@@ -198,7 +198,7 @@ Returns one of:
198 (if parse-dtd 198 (if parse-dtd
199 (setq dtd (xml-parse-dtd end)) 199 (setq dtd (xml-parse-dtd end))
200 (xml-skip-dtd end)) 200 (xml-skip-dtd end))
201 (goto-char (- (re-search-forward "[^[:space:]]") 1)) 201 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
202 (if dtd 202 (if dtd
203 (cons dtd (xml-parse-tag end)) 203 (cons dtd (xml-parse-tag end))
204 (xml-parse-tag end)))) 204 (xml-parse-tag end))))
@@ -210,7 +210,7 @@ Returns one of:
210 ((looking-at "</") 210 ((looking-at "</")
211 '()) 211 '())
212 ;; opening tag 212 ;; opening tag
213 ((looking-at "<\\([^/>[:space:]]+\\)") 213 ((looking-at "<\\([^/> \t\n\r]+\\)")
214 (goto-char (match-end 1)) 214 (goto-char (match-end 1))
215 (let* ((case-fold-search nil) ;; XML is case-sensitive. 215 (let* ((case-fold-search nil) ;; XML is case-sensitive.
216 (node-name (match-string 1)) 216 (node-name (match-string 1))
@@ -219,7 +219,7 @@ Returns one of:
219 pos) 219 pos)
220 220
221 ;; is this an empty element ? 221 ;; is this an empty element ?
222 (if (looking-at "/[[:space:]]*>") 222 (if (looking-at "/[ \t\n\r]*>")
223 (progn 223 (progn
224 (forward-char 2) 224 (forward-char 2)
225 (nreverse (cons '("") children))) 225 (nreverse (cons '("") children)))
@@ -230,7 +230,7 @@ Returns one of:
230 (forward-char 1) 230 (forward-char 1)
231 ;; Now check that we have the right end-tag. Note that this 231 ;; Now check that we have the right end-tag. Note that this
232 ;; one might contain spaces after the tag name 232 ;; one might contain spaces after the tag name
233 (while (not (looking-at (concat "</" node-name "[[:space:]]*>"))) 233 (while (not (looking-at (concat "</" node-name "[ \t\n\r]*>")))
234 (cond 234 (cond
235 ((looking-at "</") 235 ((looking-at "</")
236 (error (concat 236 (error (concat
@@ -248,12 +248,14 @@ Returns one of:
248 (let ((string (buffer-substring-no-properties pos (point))) 248 (let ((string (buffer-substring-no-properties pos (point)))
249 (pos 0)) 249 (pos 0))
250 250
251 ;; Clean up the string (no newline characters) 251 ;; Clean up the string. As per XML
252 ;; Not done, since as per XML specifications, the XML processor 252 ;; specifications, the XML processor should
253 ;; should always pass the whole string to the application. 253 ;; always pass the whole string to the
254 ;; (while (string-match "\\s +" string pos) 254 ;; application. But \r's should be replaced:
255 ;; (setq string (replace-match " " t t string)) 255 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
256 ;; (setq pos (1+ (match-beginning 0)))) 256 (while (string-match "\r\n?" string pos)
257 (setq string (replace-match "\n" t t string))
258 (setq pos (1+ (match-beginning 0))))
257 259
258 (setq string (xml-substitute-special string)) 260 (setq string (xml-substitute-special string))
259 (setq children 261 (setq children
@@ -280,28 +282,44 @@ Returns one of:
280The search for attributes end at the position END in the current buffer. 282The search for attributes end at the position END in the current buffer.
281Leaves the point on the first non-blank character after the tag." 283Leaves the point on the first non-blank character after the tag."
282 (let ((attlist ()) 284 (let ((attlist ())
283 name) 285 start-pos name)
284 (goto-char (- (re-search-forward "[^[:space:]]") 1)) 286 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
285 (while (looking-at "\\([a-zA-Z_:][-a-zA-Z0-9._:]*\\)[[:space:]]*=[[:space:]]*") 287 (while (looking-at "\\([a-zA-Z_:][-a-zA-Z0-9._:]*\\)[ \t\n\r]*=[ \t\n\r]*")
286 (setq name (intern (match-string 1))) 288 (setq name (intern (match-string 1)))
287 (goto-char (match-end 0)) 289 (goto-char (match-end 0))
288 290
291 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
292
289 ;; Do we have a string between quotes (or double-quotes), 293 ;; Do we have a string between quotes (or double-quotes),
290 ;; or a simple word ? 294 ;; or a simple word ?
291 (unless (looking-at "\"\\([^\"]*\\)\"") 295 (if (looking-at "\"\\([^\"]*\\)\"")
292 (unless (looking-at "'\\([^']*\\)'") 296 (setq start-pos (match-beginning 0))
297 (if (looking-at "'\\([^']*\\)")
298 (setq start-pos (match-beginning 0))
293 (error "XML: Attribute values must be given between quotes"))) 299 (error "XML: Attribute values must be given between quotes")))
294 300
295 ;; Each attribute must be unique within a given element 301 ;; Each attribute must be unique within a given element
296 (if (assoc name attlist) 302 (if (assoc name attlist)
297 (error "XML: each attribute must be unique within an element")) 303 (error "XML: each attribute must be unique within an element"))
298 304
299 (push (cons name (match-string-no-properties 1)) attlist) 305 ;; Multiple whitespace characters should be replaced with a single one
300 (goto-char (match-end 0)) 306 ;; in the attributes
301 (goto-char (- (re-search-forward "[^[:space:]]") 1)) 307 (let ((string (match-string-no-properties 1))
308 (pos 0))
309 (while (string-match "[ \t\n\r]+" string pos)
310 (setq string (replace-match " " t nil string))
311 (setq pos (1+ (match-beginning 0))))
312 (push (cons name (xml-substitute-special string)) attlist))
313
314 (goto-char start-pos)
315 (if (looking-at "\"\\([^\"]*\\)\"")
316 (goto-char (match-end 0))
317 (if (looking-at "'\\([^']*\\)")
318 (goto-char (match-end 0))))
319
320 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
302 (if (> (point) end) 321 (if (> (point) end)
303 (error "XML: end of attribute list not found before end of region")) 322 (error "XML: end of attribute list not found before end of region")))
304 )
305 (nreverse attlist))) 323 (nreverse attlist)))
306 324
307;;******************************************************************* 325;;*******************************************************************
@@ -318,15 +336,15 @@ The DTD must end before the position END in the current buffer.
318The point must be just before the starting tag of the DTD. 336The point must be just before the starting tag of the DTD.
319This follows the rule [28] in the XML specifications." 337This follows the rule [28] in the XML specifications."
320 (forward-char (length "<!DOCTYPE")) 338 (forward-char (length "<!DOCTYPE"))
321 (if (looking-at "[[:space:]]*>") 339 (if (looking-at "[ \t\n\r]*>")
322 (error "XML: invalid DTD (excepting name of the document)")) 340 (error "XML: invalid DTD (excepting name of the document)"))
323 (condition-case nil 341 (condition-case nil
324 (progn 342 (progn
325 (forward-word 1) ;; name of the document 343 (forward-word 1)
326 (goto-char (- (re-search-forward "[[:space:]]") 1)) 344 (goto-char (- (re-search-forward "[ \t\n\r]") 1))
327 (goto-char (- (re-search-forward "[^[:space:]]") 1)) 345 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
328 (if (looking-at "\\[") 346 (if (looking-at "\\[")
329 (re-search-forward "\\][[:space:]]*>" end) 347 (re-search-forward "\\][ \t\n\r]*>" end)
330 (search-forward ">" end))) 348 (search-forward ">" end)))
331 (error (error "XML: No end to the DTD")))) 349 (error (error "XML: No end to the DTD"))))
332 350
@@ -334,7 +352,7 @@ This follows the rule [28] in the XML specifications."
334 "Parse the DTD that point is looking at. 352 "Parse the DTD that point is looking at.
335The DTD must end before the position END in the current buffer." 353The DTD must end before the position END in the current buffer."
336 (forward-char (length "<!DOCTYPE")) 354 (forward-char (length "<!DOCTYPE"))
337 (goto-char (- (re-search-forward "[^[:space:]]") 1)) 355 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
338 (if (looking-at ">") 356 (if (looking-at ">")
339 (error "XML: invalid DTD (excepting name of the document)")) 357 (error "XML: invalid DTD (excepting name of the document)"))
340 358
@@ -344,24 +362,24 @@ The DTD must end before the position END in the current buffer."
344 type element end-pos) 362 type element end-pos)
345 (goto-char (match-end 0)) 363 (goto-char (match-end 0))
346 364
347 (goto-char (- (re-search-forward "[^[:space:]]") 1)) 365 (goto-char (- (re-search-forward "[^ \t\n\r]") 1))
348 366
349 ;; External DTDs => don't know how to handle them yet 367 ;; External DTDs => don't know how to handle them yet
350 (if (looking-at "SYSTEM") 368 (if (looking-at "SYSTEM")
351 (error "XML: Don't know how to handle external DTDs")) 369 (error "XML: Don't know how to handle external DTDs"))
352 370
353 (if (not (= (char-after) ?\[)) 371 (if (not (= (char-after) ?\[))
354 (error "XML: Unknown declaration in the DTD")) 372 (error "XML: Unknown declaration in the DTD"))
355 373
356 ;; Parse the rest of the DTD 374 ;; Parse the rest of the DTD
357 (forward-char 1) 375 (forward-char 1)
358 (while (and (not (looking-at "[[:space:]]*\\]")) 376 (while (and (not (looking-at "[ \t\n\r]*\\]"))
359 (<= (point) end)) 377 (<= (point) end))
360 (cond 378 (cond
361 379
362 ;; Translation of rule [45] of XML specifications 380 ;; Translation of rule [45] of XML specifications
363 ((looking-at 381 ((looking-at
364 "[[:space:]]*<!ELEMENT[[:space:]]+\\([a-zA-Z0-9.%;]+\\)[[:space:]]+\\([^>]+\\)>") 382 "[ \t\n\r]*<!ELEMENT[ \t\n\r]+\\([a-zA-Z0-9.%;]+\\)[ \t\n\r]+\\([^>]+\\)>")
365 383
366 (setq element (intern (match-string-no-properties 1)) 384 (setq element (intern (match-string-no-properties 1))
367 type (match-string-no-properties 2)) 385 type (match-string-no-properties 2))
@@ -369,13 +387,13 @@ The DTD must end before the position END in the current buffer."
369 387
370 ;; Translation of rule [46] of XML specifications 388 ;; Translation of rule [46] of XML specifications
371 (cond 389 (cond
372 ((string-match "^EMPTY[[:space:]]*$" type) ;; empty declaration 390 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
373 (setq type 'empty)) 391 (setq type 'empty))
374 ((string-match "^ANY[[:space:]]*$" type) ;; any type of contents 392 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
375 (setq type 'any)) 393 (setq type 'any))
376 ((string-match "^(\\(.*\\))[[:space:]]*$" type) ;; children ([47]) 394 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
377 (setq type (xml-parse-elem-type (match-string-no-properties 1 type)))) 395 (setq type (xml-parse-elem-type (match-string-no-properties 1 type))))
378 ((string-match "^%[^;]+;[[:space:]]*$" type) ;; substitution 396 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
379 nil) 397 nil)
380 (t 398 (t
381 (error "XML: Invalid element type in the DTD"))) 399 (error "XML: Invalid element type in the DTD")))
@@ -417,8 +435,8 @@ The DTD must end before the position END in the current buffer."
417 (mapcar 'xml-parse-elem-type 435 (mapcar 'xml-parse-elem-type
418 (split-string elem ",")))) 436 (split-string elem ","))))
419 ))) 437 )))
420 (if (string-match "[[:space:]]*\\([^+*?]+\\)\\([+*?]?\\)" string) 438 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
421 (setq elem (match-string 1 string) 439 (setq elem (match-string 1 string)
422 modifier (match-string 2 string)))) 440 modifier (match-string 2 string))))
423 441
424 (if (and (stringp elem) (string= elem "#PCDATA")) 442 (if (and (stringp elem) (string= elem "#PCDATA"))
@@ -434,6 +452,22 @@ The DTD must end before the position END in the current buffer."
434 (t 452 (t
435 elem)))) 453 elem))))
436 454
455;;*******************************************************************
456;;**
457;;** Converting code points to strings
458;;**
459;;*******************************************************************
460
461(defun xml-ucs-to-string (codepoint)
462 "Return a string representation of CODEPOINT. If it can't be
463converted, return '?'."
464 (cond ((boundp 'decode-char)
465 (char-to-string (decode-char 'ucs codepoint)))
466 ((and (< codepoint 128)
467 (> codepoint 31))
468 (char-to-string codepoint))
469 (t "?"))) ; FIXME: There's gotta be a better way to
470 ; designate an unknown character.
437 471
438;;******************************************************************* 472;;*******************************************************************
439;;** 473;;**
@@ -451,6 +485,18 @@ The DTD must end before the position END in the current buffer."
451 (setq string (replace-match "'" t nil string))) 485 (setq string (replace-match "'" t nil string)))
452 (while (string-match "&quot;" string) 486 (while (string-match "&quot;" string)
453 (setq string (replace-match "\"" t nil string))) 487 (setq string (replace-match "\"" t nil string)))
488 (while (string-match "&#\\([0-9]+\\);" string)
489 (setq string (replace-match (xml-ucs-to-string
490 (string-to-number
491 (match-string-no-properties 1 string)))
492 t nil string)))
493 (while (string-match "&#x\\([0-9a-fA-F]+\\);" string)
494 (setq string (replace-match (xml-ucs-to-string
495 (string-to-number
496 (match-string-no-properties 1 string)
497 16))
498 t nil string)))
499
454 ;; This goes last so it doesn't confuse the matches above. 500 ;; This goes last so it doesn't confuse the matches above.
455 (while (string-match "&amp;" string) 501 (while (string-match "&amp;" string)
456 (setq string (replace-match "&" t nil string))) 502 (setq string (replace-match "&" t nil string)))