aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Lars Ingebrigtsen 2019-09-14 16:07:34 +0200
committer Lars Ingebrigtsen 2019-09-14 16:10:23 +0200
commit 568f1488a69e8cb0961571ff8f158df8891c3c44 (patch)
tree e83e7e54897400293078cd018d0f71ddee3e8165
parent 49a4b86925f1338268a2e79d0ef164a3cb368ec2 (diff)
download emacs-568f1488a69e8cb0961571ff8f158df8891c3c44.tar.gz
emacs-568f1488a69e8cb0961571ff8f158df8891c3c44.zip
Make eww more liberal when interpreting some invalid HTML
* lisp/net/eww.el (eww--preprocess-html): New function (bug#37009)
to be more lenient with invalid HTML and translate common invalid
HTML like "a <= b" into "a &lt;= b" to be more liberal in what we
accept before parsing.
(eww-display-html): Use it.
(eww-readable): Ditto.
-rw-r--r-- lisp/net/eww.el 14
1 files changed, 14 insertions, 0 deletions
diff --git a/lisp/net/eww.el b/lisp/net/eww.el
index 77e6cec9b04..2013604c9e7 100644
--- a/lisp/net/eww.el
+++ b/lisp/net/eww.el
@@ -326,6 +326,18 @@ the default EWW buffer."
326 #'url-hexify-string (split-string url) "+")))))) 326 #'url-hexify-string (split-string url) "+"))))))
327 url) 327 url)
328 328
329(defun eww--preprocess-html (start end)
330 "Translate all < characters that do not look like start of tags into &lt;."
331 (save-excursion
332 (save-restriction
333 (narrow-to-region start end)
334 (goto-char start)
335 (let ((case-fold-search t))
336 (while (re-search-forward "<[^0-9a-z!/]" nil t)
337 (goto-char (match-beginning 0))
338 (delete-region (point) (1+ (point)))
339 (insert "&lt;"))))))
340
329;;;###autoload (defalias 'browse-web 'eww) 341;;;###autoload (defalias 'browse-web 'eww)
330 342
331;;;###autoload 343;;;###autoload
@@ -479,6 +491,7 @@ Currently this means either text/html or application/xhtml+xml."
479 ;; Remove CRLF and replace NUL with &#0; before parsing. 491 ;; Remove CRLF and replace NUL with &#0; before parsing.
480 (while (re-search-forward "\\(\r$\\)\\|\0" nil t) 492 (while (re-search-forward "\\(\r$\\)\\|\0" nil t)
481 (replace-match (if (match-beginning 1) "" "&#0;") t t))) 493 (replace-match (if (match-beginning 1) "" "&#0;") t t)))
494 (eww--preprocess-html (point) (point-max))
482 (libxml-parse-html-region (point) (point-max)))))) 495 (libxml-parse-html-region (point) (point-max))))))
483 (source (and (null document) 496 (source (and (null document)
484 (buffer-substring (point) (point-max))))) 497 (buffer-substring (point) (point-max)))))
@@ -716,6 +729,7 @@ the like."
716 (condition-case nil 729 (condition-case nil
717 (decode-coding-region (point-min) (point-max) 'utf-8) 730 (decode-coding-region (point-min) (point-max) 'utf-8)
718 (coding-system-error nil)) 731 (coding-system-error nil))
732 (eww--preprocess-html (point-min) (point-max))
719 (libxml-parse-html-region (point-min) (point-max)))) 733 (libxml-parse-html-region (point-min) (point-max))))
720 (base (plist-get eww-data :url))) 734 (base (plist-get eww-data :url)))
721 (eww-score-readability dom) 735 (eww-score-readability dom)