aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJim Porter2025-06-17 09:11:55 -0700
committerJim Porter2025-06-25 21:26:04 -0700
commit2bdcf0250acecdb0719203ae711aedf5baad1783 (patch)
tree542289c62c26eab813388a761f823f3f783c6e7a
parentfaae9f572ab3027c46800575a12a72c8c0eee7c1 (diff)
downloademacs-2bdcf0250acecdb0719203ae711aedf5baad1783.tar.gz
emacs-2bdcf0250acecdb0719203ae711aedf5baad1783.zip
When making a readable page in EWW, include the <title> and similar tags
* lisp/net/eww.el (eww--walk-readability, eww-readable-dom): New functions. (eww-display-html): Call 'eww-readable-dom'. (eww-readable): Call 'eww-readable-dom'. Don't copy over 'eww-data' properties that our new readable page can handle on its own. (eww-score-readability): Rewrite in terms of 'eww--walk-readability'. Make obsolete. (eww-highest-readability): Make obsolete. * test/lisp/net/eww-tests.el (eww-test--lots-of-words) (eww-test--wordy-page): New variables... (eww-test/readable/toggle-display): ... use them. (eww-test/readable/default-readable): Make sure that the readable page includes the <title> and <link> tags (bug#77299).
-rw-r--r--lisp/net/eww.el106
-rw-r--r--test/lisp/net/eww-tests.el35
2 files changed, 102 insertions, 39 deletions
diff --git a/lisp/net/eww.el b/lisp/net/eww.el
index c70e4f40ee4..f27b66c5dd5 100644
--- a/lisp/net/eww.el
+++ b/lisp/net/eww.el
@@ -863,8 +863,7 @@ This replaces the region with the preprocessed HTML."
863 (unless document 863 (unless document
864 (let ((dom (eww--parse-html-region (point) (point-max) charset))) 864 (let ((dom (eww--parse-html-region (point) (point-max) charset)))
865 (when (eww-default-readable-p url) 865 (when (eww-default-readable-p url)
866 (eww-score-readability dom) 866 (setq dom (eww-readable-dom dom))
867 (setq dom (eww-highest-readability dom))
868 (with-current-buffer buffer 867 (with-current-buffer buffer
869 (plist-put eww-data :readable t))) 868 (plist-put eww-data :readable t)))
870 (setq document (eww-document-base url dom)))) 869 (setq document (eww-document-base url dom))))
@@ -1163,42 +1162,97 @@ adds a new entry to `eww-history'."
1163 (eww--parse-html-region (point-min) (point-max)))) 1162 (eww--parse-html-region (point-min) (point-max))))
1164 (base (plist-get eww-data :url))) 1163 (base (plist-get eww-data :url)))
1165 (when make-readable 1164 (when make-readable
1166 (eww-score-readability dom) 1165 (setq dom (eww-readable-dom dom)))
1167 (setq dom (eww-highest-readability dom)))
1168 (when eww-readable-adds-to-history 1166 (when eww-readable-adds-to-history
1169 (eww-save-history) 1167 (eww-save-history)
1170 (eww--before-browse) 1168 (eww--before-browse)
1171 (dolist (elem '(:source :url :title :next :previous :up :peer)) 1169 (dolist (elem '(:source :url :peer))
1172 (plist-put eww-data elem (plist-get old-data elem)))) 1170 (plist-put eww-data elem (plist-get old-data elem))))
1173 (eww-display-document (eww-document-base base dom)) 1171 (eww-display-document (eww-document-base base dom))
1174 (plist-put eww-data :readable make-readable) 1172 (plist-put eww-data :readable make-readable)
1175 (eww--after-page-change))) 1173 (eww--after-page-change)))
1176 1174
1177(defun eww-score-readability (node) 1175(defun eww--walk-readability (node callback &optional noscore)
1178 (let ((score -1)) 1176 "Walk through all children of NODE to score readability.
1179 (cond 1177After scoring, call CALLBACK with the node and score. If NOSCORE is
1180 ((memq (dom-tag node) '(script head comment)) 1178non-nil, don't actually compute a score; just call the callback."
1181 (setq score -2)) 1179 (let ((score nil))
1182 ((eq (dom-tag node) 'meta) 1180 (unless noscore
1183 (setq score -1)) 1181 (cond
1184 ((eq (dom-tag node) 'img) 1182 ((stringp node)
1185 (setq score 2)) 1183 (setq score (length (split-string node))
1186 ((eq (dom-tag node) 'a) 1184 noscore t))
1187 (setq score (- (length (split-string (dom-text node)))))) 1185 ((memq (dom-tag node) '(script head comment))
1188 (t 1186 (setq score -2
1187 noscore t))
1188 ((eq (dom-tag node) 'meta)
1189 (setq score -1
1190 noscore t))
1191 ((eq (dom-tag node) 'img)
1192 (setq score 2
1193 noscore t))
1194 ((eq (dom-tag node) 'a)
1195 (setq score (- (length (split-string (dom-text node))))
1196 noscore t))
1197 (t
1198 (setq score -1))))
1199 (when (consp node)
1189 (dolist (elem (dom-children node)) 1200 (dolist (elem (dom-children node))
1190 (cond 1201 (let ((subscore (eww--walk-readability elem callback noscore)))
1191 ((stringp elem) 1202 (when (and (not noscore) subscore)
1192 (setq score (+ score (length (split-string elem))))) 1203 (incf score subscore)))))
1193 ((consp elem) 1204 (funcall callback node score)
1194 (setq score (+ score
1195 (or (cdr (assoc :eww-readability-score (cdr elem)))
1196 (eww-score-readability elem)))))))))
1197 ;; Cache the score of the node to avoid recomputing all the time.
1198 (dom-set-attribute node :eww-readability-score score)
1199 score)) 1205 score))
1200 1206
1207(defun eww-readable-dom (dom)
1208 "Return a readable version of DOM."
1209 (let ((head-nodes nil)
1210 (best-node nil)
1211 (best-score most-negative-fixnum))
1212 (eww--walk-readability
1213 dom
1214 (lambda (node score)
1215 (when (consp node)
1216 (when (and score (> score best-score)
1217 ;; We set a lower bound to how long we accept that
1218 ;; the readable portion of the page is going to be.
1219 (> (length (split-string (dom-texts node))) 100))
1220 (setq best-score score
1221 best-node node))
1222 ;; Keep track of any <title> and <link> tags we find to include
1223 ;; in the final document. EWW uses them for various features,
1224 ;; like renaming the buffer or navigating to "next" and
1225 ;; "previous" pages. NOTE: We could probably filter out
1226 ;; stylesheet <link> tags here, though it doesn't really matter
1227 ;; since we don't *do* anything with stylesheets...
1228 (when (memq (dom-tag node) '(title link))
1229 ;; Copy the node, but not any of its (non-text) children.
1230 ;; This way, we can ensure that we don't include a node
1231 ;; directly in our list in addition to as a child of some
1232 ;; other node in the list. This is ok for <title> and <link>
1233 ;; tags, but might need changed if supporting other tags.
1234 (let* ((inner-text (dom-texts node ""))
1235 (new-node `(,(dom-tag node)
1236 ,(dom-attributes node)
1237 ,@(when (length> inner-text 0)
1238 (list inner-text)))))
1239 (push new-node head-nodes))))))
1240 (if (and best-node (not (eq best-node dom)))
1241 `(html nil
1242 (head nil ,@head-nodes)
1243 (body nil ,best-node))
1244 dom)))
1245
1246(defun eww-score-readability (node)
1247 (declare (obsolete 'eww--walk-readability "31.1"))
1248 (eww--walk-readability
1249 node
1250 (lambda (node score)
1251 (when (and score (consp node))
1252 (dom-set-attribute node :eww-readability-score score)))))
1253
1201(defun eww-highest-readability (node) 1254(defun eww-highest-readability (node)
1255 (declare (obsolete 'eww-readable-dom "31.1"))
1202 (let ((result node) 1256 (let ((result node)
1203 highest) 1257 highest)
1204 (dolist (elem (dom-non-text-children node)) 1258 (dolist (elem (dom-non-text-children node))
diff --git a/test/lisp/net/eww-tests.el b/test/lisp/net/eww-tests.el
index e7c5a29ecd4..18cbd272991 100644
--- a/test/lisp/net/eww-tests.el
+++ b/test/lisp/net/eww-tests.el
@@ -29,6 +29,21 @@
29The default just returns an empty list of headers and the URL as the 29The default just returns an empty list of headers and the URL as the
30body.") 30body.")
31 31
32(defvar eww-test--lots-of-words
33 (string-join (make-list 20 "All work and no play makes Jack a dull boy.")
34 " ")
35 "A long enough run of words to satisfy EWW's readable mode cutoff.")
36
37(defvar eww-test--wordy-page
38 (concat "<html>"
39 "<head>"
40 "<title>Welcome to my home page</title>"
41 "<link rel=\"home\" href=\"somewhere.invalid\">"
42 "</head><body>"
43 "<a>This is an uninteresting sentence.</a>"
44 "<div>" eww-test--lots-of-words "</div>"
45 "</body></html>"))
46
32(defmacro eww-test--with-mock-retrieve (&rest body) 47(defmacro eww-test--with-mock-retrieve (&rest body)
33 "Evaluate BODY with a mock implementation of `eww-retrieve'. 48 "Evaluate BODY with a mock implementation of `eww-retrieve'.
34This avoids network requests during our tests. Additionally, prepare a 49This avoids network requests during our tests. Additionally, prepare a
@@ -201,19 +216,10 @@ This sets `eww-before-browse-history-function' to
201 (eww-test--with-mock-retrieve 216 (eww-test--with-mock-retrieve
202 (let* ((shr-width most-positive-fixnum) 217 (let* ((shr-width most-positive-fixnum)
203 (shr-use-fonts nil) 218 (shr-use-fonts nil)
204 (words (string-join
205 (make-list
206 20 "All work and no play makes Jack a dull boy.")
207 " "))
208 (eww-test--response-function 219 (eww-test--response-function
209 (lambda (_url) 220 (lambda (_url)
210 (concat "Content-Type: text/html\n\n" 221 (concat "Content-Type: text/html\n\n"
211 "<html><body>" 222 eww-test--wordy-page))))
212 "<a>This is an uninteresting sentence.</a>"
213 "<div>"
214 words
215 "</div>"
216 "</body></html>"))))
217 (eww "example.invalid") 223 (eww "example.invalid")
218 ;; Make sure EWW renders the whole document. 224 ;; Make sure EWW renders the whole document.
219 (should-not (plist-get eww-data :readable)) 225 (should-not (plist-get eww-data :readable))
@@ -224,7 +230,7 @@ This sets `eww-before-browse-history-function' to
224 ;; Now, EWW should render just the "readable" parts. 230 ;; Now, EWW should render just the "readable" parts.
225 (should (plist-get eww-data :readable)) 231 (should (plist-get eww-data :readable))
226 (should (string-match-p 232 (should (string-match-p
227 (concat "\\`" (regexp-quote words) "\n*\\'") 233 (concat "\\`" (regexp-quote eww-test--lots-of-words) "\n*\\'")
228 (buffer-substring-no-properties (point-min) (point-max)))) 234 (buffer-substring-no-properties (point-min) (point-max))))
229 (eww-readable 'toggle) 235 (eww-readable 'toggle)
230 ;; Finally, EWW should render the whole document again. 236 ;; Finally, EWW should render the whole document again.
@@ -240,11 +246,14 @@ This sets `eww-before-browse-history-function' to
240 (let* ((eww-test--response-function 246 (let* ((eww-test--response-function
241 (lambda (_url) 247 (lambda (_url)
242 (concat "Content-Type: text/html\n\n" 248 (concat "Content-Type: text/html\n\n"
243 "<html><body>Hello there</body></html>"))) 249 eww-test--wordy-page)))
244 (eww-readable-urls '("://example\\.invalid/"))) 250 (eww-readable-urls '("://example\\.invalid/")))
245 (eww "example.invalid") 251 (eww "example.invalid")
246 ;; Make sure EWW uses "readable" mode. 252 ;; Make sure EWW uses "readable" mode.
247 (should (plist-get eww-data :readable))))) 253 (should (plist-get eww-data :readable))
254 ;; Make sure the page include the <title> and <link> nodes.
255 (should (equal (plist-get eww-data :title) "Welcome to my home page"))
256 (should (equal (plist-get eww-data :home) "somewhere.invalid")))))
248 257
249(provide 'eww-tests) 258(provide 'eww-tests)
250;; eww-tests.el ends here 259;; eww-tests.el ends here