diff options
| author | Jim Porter | 2025-06-17 09:11:55 -0700 |
|---|---|---|
| committer | Jim Porter | 2025-06-25 21:26:04 -0700 |
| commit | 2bdcf0250acecdb0719203ae711aedf5baad1783 (patch) | |
| tree | 542289c62c26eab813388a761f823f3f783c6e7a | |
| parent | faae9f572ab3027c46800575a12a72c8c0eee7c1 (diff) | |
| download | emacs-2bdcf0250acecdb0719203ae711aedf5baad1783.tar.gz emacs-2bdcf0250acecdb0719203ae711aedf5baad1783.zip | |
When making a readable page in EWW, include the <title> and similar tags
* lisp/net/eww.el (eww--walk-readability, eww-readable-dom): New
functions.
(eww-display-html): Call 'eww-readable-dom'.
(eww-readable): Call 'eww-readable-dom'. Don't copy over 'eww-data'
properties that our new readable page can handle on its own.
(eww-score-readability): Rewrite in terms of 'eww--walk-readability'.
Make obsolete.
(eww-highest-readability): Make obsolete.
* test/lisp/net/eww-tests.el (eww-test--lots-of-words)
(eww-test--wordy-page): New variables...
(eww-test/readable/toggle-display): ... use them.
(eww-test/readable/default-readable): Make sure that the readable page
includes the <title> and <link> tags (bug#77299).
| -rw-r--r-- | lisp/net/eww.el | 106 | ||||
| -rw-r--r-- | test/lisp/net/eww-tests.el | 35 |
2 files changed, 102 insertions, 39 deletions
diff --git a/lisp/net/eww.el b/lisp/net/eww.el index c70e4f40ee4..f27b66c5dd5 100644 --- a/lisp/net/eww.el +++ b/lisp/net/eww.el | |||
| @@ -863,8 +863,7 @@ This replaces the region with the preprocessed HTML." | |||
| 863 | (unless document | 863 | (unless document |
| 864 | (let ((dom (eww--parse-html-region (point) (point-max) charset))) | 864 | (let ((dom (eww--parse-html-region (point) (point-max) charset))) |
| 865 | (when (eww-default-readable-p url) | 865 | (when (eww-default-readable-p url) |
| 866 | (eww-score-readability dom) | 866 | (setq dom (eww-readable-dom dom)) |
| 867 | (setq dom (eww-highest-readability dom)) | ||
| 868 | (with-current-buffer buffer | 867 | (with-current-buffer buffer |
| 869 | (plist-put eww-data :readable t))) | 868 | (plist-put eww-data :readable t))) |
| 870 | (setq document (eww-document-base url dom)))) | 869 | (setq document (eww-document-base url dom)))) |
| @@ -1163,42 +1162,97 @@ adds a new entry to `eww-history'." | |||
| 1163 | (eww--parse-html-region (point-min) (point-max)))) | 1162 | (eww--parse-html-region (point-min) (point-max)))) |
| 1164 | (base (plist-get eww-data :url))) | 1163 | (base (plist-get eww-data :url))) |
| 1165 | (when make-readable | 1164 | (when make-readable |
| 1166 | (eww-score-readability dom) | 1165 | (setq dom (eww-readable-dom dom))) |
| 1167 | (setq dom (eww-highest-readability dom))) | ||
| 1168 | (when eww-readable-adds-to-history | 1166 | (when eww-readable-adds-to-history |
| 1169 | (eww-save-history) | 1167 | (eww-save-history) |
| 1170 | (eww--before-browse) | 1168 | (eww--before-browse) |
| 1171 | (dolist (elem '(:source :url :title :next :previous :up :peer)) | 1169 | (dolist (elem '(:source :url :peer)) |
| 1172 | (plist-put eww-data elem (plist-get old-data elem)))) | 1170 | (plist-put eww-data elem (plist-get old-data elem)))) |
| 1173 | (eww-display-document (eww-document-base base dom)) | 1171 | (eww-display-document (eww-document-base base dom)) |
| 1174 | (plist-put eww-data :readable make-readable) | 1172 | (plist-put eww-data :readable make-readable) |
| 1175 | (eww--after-page-change))) | 1173 | (eww--after-page-change))) |
| 1176 | 1174 | ||
| 1177 | (defun eww-score-readability (node) | 1175 | (defun eww--walk-readability (node callback &optional noscore) |
| 1178 | (let ((score -1)) | 1176 | "Walk through all children of NODE to score readability. |
| 1179 | (cond | 1177 | After scoring, call CALLBACK with the node and score. If NOSCORE is |
| 1180 | ((memq (dom-tag node) '(script head comment)) | 1178 | non-nil, don't actually compute a score; just call the callback." |
| 1181 | (setq score -2)) | 1179 | (let ((score nil)) |
| 1182 | ((eq (dom-tag node) 'meta) | 1180 | (unless noscore |
| 1183 | (setq score -1)) | 1181 | (cond |
| 1184 | ((eq (dom-tag node) 'img) | 1182 | ((stringp node) |
| 1185 | (setq score 2)) | 1183 | (setq score (length (split-string node)) |
| 1186 | ((eq (dom-tag node) 'a) | 1184 | noscore t)) |
| 1187 | (setq score (- (length (split-string (dom-text node)))))) | 1185 | ((memq (dom-tag node) '(script head comment)) |
| 1188 | (t | 1186 | (setq score -2 |
| 1187 | noscore t)) | ||
| 1188 | ((eq (dom-tag node) 'meta) | ||
| 1189 | (setq score -1 | ||
| 1190 | noscore t)) | ||
| 1191 | ((eq (dom-tag node) 'img) | ||
| 1192 | (setq score 2 | ||
| 1193 | noscore t)) | ||
| 1194 | ((eq (dom-tag node) 'a) | ||
| 1195 | (setq score (- (length (split-string (dom-text node)))) | ||
| 1196 | noscore t)) | ||
| 1197 | (t | ||
| 1198 | (setq score -1)))) | ||
| 1199 | (when (consp node) | ||
| 1189 | (dolist (elem (dom-children node)) | 1200 | (dolist (elem (dom-children node)) |
| 1190 | (cond | 1201 | (let ((subscore (eww--walk-readability elem callback noscore))) |
| 1191 | ((stringp elem) | 1202 | (when (and (not noscore) subscore) |
| 1192 | (setq score (+ score (length (split-string elem))))) | 1203 | (incf score subscore))))) |
| 1193 | ((consp elem) | 1204 | (funcall callback node score) |
| 1194 | (setq score (+ score | ||
| 1195 | (or (cdr (assoc :eww-readability-score (cdr elem))) | ||
| 1196 | (eww-score-readability elem))))))))) | ||
| 1197 | ;; Cache the score of the node to avoid recomputing all the time. | ||
| 1198 | (dom-set-attribute node :eww-readability-score score) | ||
| 1199 | score)) | 1205 | score)) |
| 1200 | 1206 | ||
| 1207 | (defun eww-readable-dom (dom) | ||
| 1208 | "Return a readable version of DOM." | ||
| 1209 | (let ((head-nodes nil) | ||
| 1210 | (best-node nil) | ||
| 1211 | (best-score most-negative-fixnum)) | ||
| 1212 | (eww--walk-readability | ||
| 1213 | dom | ||
| 1214 | (lambda (node score) | ||
| 1215 | (when (consp node) | ||
| 1216 | (when (and score (> score best-score) | ||
| 1217 | ;; We set a lower bound to how long we accept that | ||
| 1218 | ;; the readable portion of the page is going to be. | ||
| 1219 | (> (length (split-string (dom-texts node))) 100)) | ||
| 1220 | (setq best-score score | ||
| 1221 | best-node node)) | ||
| 1222 | ;; Keep track of any <title> and <link> tags we find to include | ||
| 1223 | ;; in the final document. EWW uses them for various features, | ||
| 1224 | ;; like renaming the buffer or navigating to "next" and | ||
| 1225 | ;; "previous" pages. NOTE: We could probably filter out | ||
| 1226 | ;; stylesheet <link> tags here, though it doesn't really matter | ||
| 1227 | ;; since we don't *do* anything with stylesheets... | ||
| 1228 | (when (memq (dom-tag node) '(title link)) | ||
| 1229 | ;; Copy the node, but not any of its (non-text) children. | ||
| 1230 | ;; This way, we can ensure that we don't include a node | ||
| 1231 | ;; directly in our list in addition to as a child of some | ||
| 1232 | ;; other node in the list. This is ok for <title> and <link> | ||
| 1233 | ;; tags, but might need changed if supporting other tags. | ||
| 1234 | (let* ((inner-text (dom-texts node "")) | ||
| 1235 | (new-node `(,(dom-tag node) | ||
| 1236 | ,(dom-attributes node) | ||
| 1237 | ,@(when (length> inner-text 0) | ||
| 1238 | (list inner-text))))) | ||
| 1239 | (push new-node head-nodes)))))) | ||
| 1240 | (if (and best-node (not (eq best-node dom))) | ||
| 1241 | `(html nil | ||
| 1242 | (head nil ,@head-nodes) | ||
| 1243 | (body nil ,best-node)) | ||
| 1244 | dom))) | ||
| 1245 | |||
| 1246 | (defun eww-score-readability (node) | ||
| 1247 | (declare (obsolete 'eww--walk-readability "31.1")) | ||
| 1248 | (eww--walk-readability | ||
| 1249 | node | ||
| 1250 | (lambda (node score) | ||
| 1251 | (when (and score (consp node)) | ||
| 1252 | (dom-set-attribute node :eww-readability-score score))))) | ||
| 1253 | |||
| 1201 | (defun eww-highest-readability (node) | 1254 | (defun eww-highest-readability (node) |
| 1255 | (declare (obsolete 'eww-readable-dom "31.1")) | ||
| 1202 | (let ((result node) | 1256 | (let ((result node) |
| 1203 | highest) | 1257 | highest) |
| 1204 | (dolist (elem (dom-non-text-children node)) | 1258 | (dolist (elem (dom-non-text-children node)) |
diff --git a/test/lisp/net/eww-tests.el b/test/lisp/net/eww-tests.el index e7c5a29ecd4..18cbd272991 100644 --- a/test/lisp/net/eww-tests.el +++ b/test/lisp/net/eww-tests.el | |||
| @@ -29,6 +29,21 @@ | |||
| 29 | The default just returns an empty list of headers and the URL as the | 29 | The default just returns an empty list of headers and the URL as the |
| 30 | body.") | 30 | body.") |
| 31 | 31 | ||
| 32 | (defvar eww-test--lots-of-words | ||
| 33 | (string-join (make-list 20 "All work and no play makes Jack a dull boy.") | ||
| 34 | " ") | ||
| 35 | "A long enough run of words to satisfy EWW's readable mode cutoff.") | ||
| 36 | |||
| 37 | (defvar eww-test--wordy-page | ||
| 38 | (concat "<html>" | ||
| 39 | "<head>" | ||
| 40 | "<title>Welcome to my home page</title>" | ||
| 41 | "<link rel=\"home\" href=\"somewhere.invalid\">" | ||
| 42 | "</head><body>" | ||
| 43 | "<a>This is an uninteresting sentence.</a>" | ||
| 44 | "<div>" eww-test--lots-of-words "</div>" | ||
| 45 | "</body></html>")) | ||
| 46 | |||
| 32 | (defmacro eww-test--with-mock-retrieve (&rest body) | 47 | (defmacro eww-test--with-mock-retrieve (&rest body) |
| 33 | "Evaluate BODY with a mock implementation of `eww-retrieve'. | 48 | "Evaluate BODY with a mock implementation of `eww-retrieve'. |
| 34 | This avoids network requests during our tests. Additionally, prepare a | 49 | This avoids network requests during our tests. Additionally, prepare a |
| @@ -201,19 +216,10 @@ This sets `eww-before-browse-history-function' to | |||
| 201 | (eww-test--with-mock-retrieve | 216 | (eww-test--with-mock-retrieve |
| 202 | (let* ((shr-width most-positive-fixnum) | 217 | (let* ((shr-width most-positive-fixnum) |
| 203 | (shr-use-fonts nil) | 218 | (shr-use-fonts nil) |
| 204 | (words (string-join | ||
| 205 | (make-list | ||
| 206 | 20 "All work and no play makes Jack a dull boy.") | ||
| 207 | " ")) | ||
| 208 | (eww-test--response-function | 219 | (eww-test--response-function |
| 209 | (lambda (_url) | 220 | (lambda (_url) |
| 210 | (concat "Content-Type: text/html\n\n" | 221 | (concat "Content-Type: text/html\n\n" |
| 211 | "<html><body>" | 222 | eww-test--wordy-page)))) |
| 212 | "<a>This is an uninteresting sentence.</a>" | ||
| 213 | "<div>" | ||
| 214 | words | ||
| 215 | "</div>" | ||
| 216 | "</body></html>")))) | ||
| 217 | (eww "example.invalid") | 223 | (eww "example.invalid") |
| 218 | ;; Make sure EWW renders the whole document. | 224 | ;; Make sure EWW renders the whole document. |
| 219 | (should-not (plist-get eww-data :readable)) | 225 | (should-not (plist-get eww-data :readable)) |
| @@ -224,7 +230,7 @@ This sets `eww-before-browse-history-function' to | |||
| 224 | ;; Now, EWW should render just the "readable" parts. | 230 | ;; Now, EWW should render just the "readable" parts. |
| 225 | (should (plist-get eww-data :readable)) | 231 | (should (plist-get eww-data :readable)) |
| 226 | (should (string-match-p | 232 | (should (string-match-p |
| 227 | (concat "\\`" (regexp-quote words) "\n*\\'") | 233 | (concat "\\`" (regexp-quote eww-test--lots-of-words) "\n*\\'") |
| 228 | (buffer-substring-no-properties (point-min) (point-max)))) | 234 | (buffer-substring-no-properties (point-min) (point-max)))) |
| 229 | (eww-readable 'toggle) | 235 | (eww-readable 'toggle) |
| 230 | ;; Finally, EWW should render the whole document again. | 236 | ;; Finally, EWW should render the whole document again. |
| @@ -240,11 +246,14 @@ This sets `eww-before-browse-history-function' to | |||
| 240 | (let* ((eww-test--response-function | 246 | (let* ((eww-test--response-function |
| 241 | (lambda (_url) | 247 | (lambda (_url) |
| 242 | (concat "Content-Type: text/html\n\n" | 248 | (concat "Content-Type: text/html\n\n" |
| 243 | "<html><body>Hello there</body></html>"))) | 249 | eww-test--wordy-page))) |
| 244 | (eww-readable-urls '("://example\\.invalid/"))) | 250 | (eww-readable-urls '("://example\\.invalid/"))) |
| 245 | (eww "example.invalid") | 251 | (eww "example.invalid") |
| 246 | ;; Make sure EWW uses "readable" mode. | 252 | ;; Make sure EWW uses "readable" mode. |
| 247 | (should (plist-get eww-data :readable))))) | 253 | (should (plist-get eww-data :readable)) |
| 254 | ;; Make sure the page include the <title> and <link> nodes. | ||
| 255 | (should (equal (plist-get eww-data :title) "Welcome to my home page")) | ||
| 256 | (should (equal (plist-get eww-data :home) "somewhere.invalid"))))) | ||
| 248 | 257 | ||
| 249 | (provide 'eww-tests) | 258 | (provide 'eww-tests) |
| 250 | ;; eww-tests.el ends here | 259 | ;; eww-tests.el ends here |