upgrade readability

2025-10-17 03:51:36 +08:00 · 2022-02-19 19:35:39 +01:00 · 2022-02-19 19:35:39 +01:00 · 20a4e2eb08
commit 20a4e2eb08
parent 9cf88d9697
4 changed files with 518 additions and 190 deletions
--- a/content.js
+++ b/content.js
@ -24,10 +24,12 @@ function pageTitle() {
 }

 function getReadableDocument() {
-	// Readability directly change the passed document so clone it so as
-	// to preserve the original web page.
+	// Readability directly changes the passed document, so clone it to preserve the original web page.
 	const documentCopy = document.cloneNode(true);
-	const readability = new Readability(documentCopy);
+	const readability = new Readability(documentCopy, {
+		serializer: el => el // so that .content is returned as a DOM element instead of an HTML string
+	});
+
 	const article = readability.parse();

 	if (!article) {
@ -36,7 +38,7 @@ function getReadableDocument() {

 	return {
 		title: article.title,
-		body: article.articleContent,
+		body: article.content,
 	}
 }

--- a/lib/JSDOMParser.js
+++ b/lib/JSDOMParser.js
@ -278,7 +278,7 @@

  var whitespace = [" ", "\t", "\n", "\r"];

-  // See http://www.w3schools.com/dom/dom_nodetype.asp
+  // See https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
  var nodeTypes = {
    ELEMENT_NODE: 1,
    ATTRIBUTE_NODE: 2,
@ -705,7 +705,6 @@
      }

      // Using Array.join() avoids the overhead from lazy string concatenation.
-      // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
      var arr = [];
      getHTML(this);
      return arr.join("");
@ -875,7 +874,11 @@

  JSDOMParser.prototype = {
    error: function(m) {
-      dump("JSDOMParser error: " + m + "\n");
+      if (typeof dump !== "undefined") {
+        dump("JSDOMParser error: " + m + "\n");
+      } else if (typeof console !== "undefined") {
+        console.log("JSDOMParser error: " + m + "\n");
+      }
      this.errorState += m + "\n";
    },

@ -1187,3 +1190,7 @@
  global.JSDOMParser = JSDOMParser;

 })(this);
+
+if (typeof module === "object") {
+  module.exports = this.JSDOMParser;
+}
--- a/lib/Readability-readerable.js
+++ b/lib/Readability-readerable.js
@ -1,5 +1,4 @@
 /* eslint-env es6:false */
-/* globals exports */
 /*
 * Copyright (c) 2010 Arc90 Inc
 *
@ -31,22 +30,30 @@ var REGEXPS = {
 function isNodeVisible(node) {
  // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
  return (!node.style || node.style.display != "none")
-      && !node.hasAttribute("hidden")
-      //check for "fallback-image" so that wikimedia math images are displayed
-      && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
+    && !node.hasAttribute("hidden")
+    //check for "fallback-image" so that wikimedia math images are displayed
+    && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
 }

 /**
 * Decides whether or not the document is reader-able without parsing the whole thing.
- *
- * @return boolean Whether or not we suspect Readability.parse() will suceeed at returning an article object.
+ * @param {Object} options Configuration object.
+ * @param {number} [options.minContentLength=140] The minimum node content length used to decide if the document is readerable.
+ * @param {number} [options.minScore=20] The minimum cumulative 'score' used to determine if the document is readerable.
+ * @param {Function} [options.visibilityChecker=isNodeVisible] The function used to determine if a node is visible.
+ * @return {boolean} Whether or not we suspect Readability.parse() will succeed at returning an article object.
 */
-function isProbablyReaderable(doc, isVisible) {
-  if (!isVisible) {
-    isVisible = isNodeVisible;
+function isProbablyReaderable(doc, options = {}) {
+  // For backward compatibility reasons 'options' can either be a configuration object or the function used
+  // to determine if a node is visible.
+  if (typeof options == "function") {
+    options = { visibilityChecker: options };
  }

-  var nodes = doc.querySelectorAll("p, pre");
+  var defaultOptions = { minScore: 20, minContentLength: 140, visibilityChecker: isNodeVisible };
+  options = Object.assign(defaultOptions, options);
+
+  var nodes = doc.querySelectorAll("p, pre, article");

  // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
  // Some articles' DOM structures might look like
@ -58,7 +65,7 @@ function isProbablyReaderable(doc, isVisible) {
  var brNodes = doc.querySelectorAll("div > br");
  if (brNodes.length) {
    var set = new Set(nodes);
-    [].forEach.call(brNodes, function(node) {
+    [].forEach.call(brNodes, function (node) {
      set.add(node.parentNode);
    });
    nodes = Array.from(set);
@ -67,9 +74,10 @@ function isProbablyReaderable(doc, isVisible) {
  var score = 0;
  // This is a little cheeky, we use the accumulator 'score' to decide what to return from
  // this callback:
-  return [].some.call(nodes, function(node) {
-    if (!isVisible(node))
+  return [].some.call(nodes, function (node) {
+    if (!options.visibilityChecker(node)) {
      return false;
+    }

    var matchString = node.className + " " + node.id;
    if (REGEXPS.unlikelyCandidates.test(matchString) &&
@ -82,19 +90,19 @@ function isProbablyReaderable(doc, isVisible) {
    }

    var textContentLength = node.textContent.trim().length;
-    if (textContentLength < 140) {
+    if (textContentLength < options.minContentLength) {
      return false;
    }

-    score += Math.sqrt(textContentLength - 140);
+    score += Math.sqrt(textContentLength - options.minContentLength);

-    if (score > 20) {
+    if (score > options.minScore) {
      return true;
    }
    return false;
  });
 }

-if (typeof exports === "object") {
-  exports.isProbablyReaderable = isProbablyReaderable;
+if (typeof module === "object") {
+  module.exports = isProbablyReaderable;
 }
--- a/lib/Readability.js
+++ b/lib/Readability.js