diff options
| author | Lars Ingebrigtsen | 2019-09-14 16:07:34 +0200 |
|---|---|---|
| committer | Lars Ingebrigtsen | 2019-09-14 16:10:23 +0200 |
| commit | 568f1488a69e8cb0961571ff8f158df8891c3c44 (patch) | |
| tree | e83e7e54897400293078cd018d0f71ddee3e8165 | |
| parent | 49a4b86925f1338268a2e79d0ef164a3cb368ec2 (diff) | |
| download | emacs-568f1488a69e8cb0961571ff8f158df8891c3c44.tar.gz emacs-568f1488a69e8cb0961571ff8f158df8891c3c44.zip | |
Make eww more liberal when interpreting some invalid HTML
* lisp/net/eww.el (eww--preprocess-html): New function (bug#37009)
to be more lenient with invalid HTML and translate common invalid
HTML like "a <= b" into "a &lt;= b" to be more liberal in what we
accept before parsing.
(eww-display-html): Use it.
(eww-readable): Ditto.
| -rw-r--r-- | lisp/net/eww.el | 14 |
1 files changed, 14 insertions, 0 deletions
diff --git a/lisp/net/eww.el b/lisp/net/eww.el index 77e6cec9b04..2013604c9e7 100644 --- a/lisp/net/eww.el +++ b/lisp/net/eww.el | |||
| @@ -326,6 +326,18 @@ the default EWW buffer." | |||
| 326 | #'url-hexify-string (split-string url) "+")))))) | 326 | #'url-hexify-string (split-string url) "+")))))) |
| 327 | url) | 327 | url) |
| 328 | 328 | ||
| 329 | (defun eww--preprocess-html (start end) | ||
| 330 | "Translate all < characters that do not look like start of tags into &lt;." | ||
| 331 | (save-excursion | ||
| 332 | (save-restriction | ||
| 333 | (narrow-to-region start end) | ||
| 334 | (goto-char start) | ||
| 335 | (let ((case-fold-search t)) | ||
| 336 | (while (re-search-forward "<[^0-9a-z!/]" nil t) | ||
| 337 | (goto-char (match-beginning 0)) | ||
| 338 | (delete-region (point) (1+ (point))) | ||
| 339 | (insert "&lt;")))))) | ||
| 340 | |||
| 329 | ;;;###autoload (defalias 'browse-web 'eww) | 341 | ;;;###autoload (defalias 'browse-web 'eww) |
| 330 | 342 | ||
| 331 | ;;;###autoload | 343 | ;;;###autoload |
| @@ -479,6 +491,7 @@ Currently this means either text/html or application/xhtml+xml." | |||
| 479 | ;; Remove CRLF and replace NUL with &#0; before parsing. | 491 | ;; Remove CRLF and replace NUL with &#0; before parsing. |
| 480 | (while (re-search-forward "\\(\r$\\)\\|\0" nil t) | 492 | (while (re-search-forward "\\(\r$\\)\\|\0" nil t) |
| 481 | (replace-match (if (match-beginning 1) "" "&#0;") t t))) | 493 | (replace-match (if (match-beginning 1) "" "&#0;") t t))) |
| 494 | (eww--preprocess-html (point) (point-max)) | ||
| 482 | (libxml-parse-html-region (point) (point-max)))))) | 495 | (libxml-parse-html-region (point) (point-max)))))) |
| 483 | (source (and (null document) | 496 | (source (and (null document) |
| 484 | (buffer-substring (point) (point-max))))) | 497 | (buffer-substring (point) (point-max))))) |
| @@ -716,6 +729,7 @@ the like." | |||
| 716 | (condition-case nil | 729 | (condition-case nil |
| 717 | (decode-coding-region (point-min) (point-max) 'utf-8) | 730 | (decode-coding-region (point-min) (point-max) 'utf-8) |
| 718 | (coding-system-error nil)) | 731 | (coding-system-error nil)) |
| 732 | (eww--preprocess-html (point-min) (point-max)) | ||
| 719 | (libxml-parse-html-region (point-min) (point-max)))) | 733 | (libxml-parse-html-region (point-min) (point-max)))) |
| 720 | (base (plist-get eww-data :url))) | 734 | (base (plist-get eww-data :url))) |
| 721 | (eww-score-readability dom) | 735 | (eww-score-readability dom) |