diff --git a/content.js b/content.js index c84f8b8ce..86282def4 100644 --- a/content.js +++ b/content.js @@ -24,10 +24,12 @@ function pageTitle() { } function getReadableDocument() { - // Readability directly change the passed document so clone it so as - // to preserve the original web page. + // Readability directly change the passed document, so clone to preserve the original web page. const documentCopy = document.cloneNode(true); - const readability = new Readability(documentCopy); + const readability = new Readability(documentCopy, { + serializer: el => el // so that .content is returned as DOM element instead of HTML + }); + const article = readability.parse(); if (!article) { @@ -36,7 +38,7 @@ function getReadableDocument() { return { title: article.title, - body: article.articleContent, + body: article.content, } } diff --git a/lib/JSDOMParser.js b/lib/JSDOMParser.js index 30c7d4cd9..7bfa2acf5 100644 --- a/lib/JSDOMParser.js +++ b/lib/JSDOMParser.js @@ -278,7 +278,7 @@ var whitespace = [" ", "\t", "\n", "\r"]; - // See http://www.w3schools.com/dom/dom_nodetype.asp + // See https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType var nodeTypes = { ELEMENT_NODE: 1, ATTRIBUTE_NODE: 2, @@ -705,7 +705,6 @@ } // Using Array.join() avoids the overhead from lazy string concatenation. - // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes var arr = []; getHTML(this); return arr.join(""); @@ -875,7 +874,11 @@ JSDOMParser.prototype = { error: function(m) { - dump("JSDOMParser error: " + m + "\n"); + if (typeof dump !== "undefined") { + dump("JSDOMParser error: " + m + "\n"); + } else if (typeof console !== "undefined") { + console.log("JSDOMParser error: " + m + "\n"); + } this.errorState += m + "\n"; }, @@ -1187,3 +1190,7 @@ global.JSDOMParser = JSDOMParser; })(this); + +if (typeof module === "object") { + module.exports = this.JSDOMParser; +} diff --git a/lib/Readability-readerable.js b/lib/Readability-readerable.js index 73060fc05..64be5e15e 100644 --- a/lib/Readability-readerable.js +++ b/lib/Readability-readerable.js @@ -1,5 +1,4 @@ /* eslint-env es6:false */ -/* globals exports */ /* * Copyright (c) 2010 Arc90 Inc * @@ -31,22 +30,30 @@ var REGEXPS = { function isNodeVisible(node) { // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes. return (!node.style || node.style.display != "none") - && !node.hasAttribute("hidden") - //check for "fallback-image" so that wikimedia math images are displayed - && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); + && !node.hasAttribute("hidden") + //check for "fallback-image" so that wikimedia math images are displayed + && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); } /** * Decides whether or not the document is reader-able without parsing the whole thing. - * - * @return boolean Whether or not we suspect Readability.parse() will suceeed at returning an article object. + * @param {Object} options Configuration object. + * @param {number} [options.minContentLength=140] The minimum node content length used to decide if the document is readerable. + * @param {number} [options.minScore=20] The minumum cumulated 'score' used to determine if the document is readerable. + * @param {Function} [options.visibilityChecker=isNodeVisible] The function used to determine if a node is visible. + * @return {boolean} Whether or not we suspect Readability.parse() will suceeed at returning an article object. */ -function isProbablyReaderable(doc, isVisible) { - if (!isVisible) { - isVisible = isNodeVisible; +function isProbablyReaderable(doc, options = {}) { + // For backward compatibility reasons 'options' can either be a configuration object or the function used + // to determine if a node is visible. + if (typeof options == "function") { + options = { visibilityChecker: options }; } - var nodes = doc.querySelectorAll("p, pre"); + var defaultOptions = { minScore: 20, minContentLength: 140, visibilityChecker: isNodeVisible }; + options = Object.assign(defaultOptions, options); + + var nodes = doc.querySelectorAll("p, pre, article"); // Get
nodes which have
node(s) and append them into the `nodes` variable. // Some articles' DOM structures might look like @@ -58,7 +65,7 @@ function isProbablyReaderable(doc, isVisible) { var brNodes = doc.querySelectorAll("div > br"); if (brNodes.length) { var set = new Set(nodes); - [].forEach.call(brNodes, function(node) { + [].forEach.call(brNodes, function (node) { set.add(node.parentNode); }); nodes = Array.from(set); @@ -67,9 +74,10 @@ function isProbablyReaderable(doc, isVisible) { var score = 0; // This is a little cheeky, we use the accumulator 'score' to decide what to return from // this callback: - return [].some.call(nodes, function(node) { - if (!isVisible(node)) + return [].some.call(nodes, function (node) { + if (!options.visibilityChecker(node)) { return false; + } var matchString = node.className + " " + node.id; if (REGEXPS.unlikelyCandidates.test(matchString) && @@ -82,19 +90,19 @@ function isProbablyReaderable(doc, isVisible) { } var textContentLength = node.textContent.trim().length; - if (textContentLength < 140) { + if (textContentLength < options.minContentLength) { return false; } - score += Math.sqrt(textContentLength - 140); + score += Math.sqrt(textContentLength - options.minContentLength); - if (score > 20) { + if (score > options.minScore) { return true; } return false; }); } -if (typeof exports === "object") { - exports.isProbablyReaderable = isProbablyReaderable; +if (typeof module === "object") { + module.exports = isProbablyReaderable; } diff --git a/lib/Readability.js b/lib/Readability.js index 359fd947e..ce06df459 100644 --- a/lib/Readability.js +++ b/lib/Readability.js @@ -50,37 +50,42 @@ function Readability(doc, options) { this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); this._keepClasses = !!options.keepClasses; + this._serializer = options.serializer || function(el) { + return el.innerHTML; + }; + this._disableJSONLD = !!options.disableJSONLD; // Start with all flags set this._flags = this.FLAG_STRIP_UNLIKELYS | - this.FLAG_WEIGHT_CLASSES | - this.FLAG_CLEAN_CONDITIONALLY; + this.FLAG_WEIGHT_CLASSES | + this.FLAG_CLEAN_CONDITIONALLY; - var logEl; // Control whether log messages are sent to the console if (this._debug) { - logEl = function(e) { - var rv = e.nodeName + " "; - if (e.nodeType == e.TEXT_NODE) { - return rv + '("' + e.textContent + '")'; + let logNode = function(node) { + if (node.nodeType == node.TEXT_NODE) { + return `${node.nodeName} ("${node.textContent}")`; } - var classDesc = e.className && ("." + e.className.replace(/ /g, ".")); - var elDesc = ""; - if (e.id) - elDesc = "(#" + e.id + classDesc + ")"; - else if (classDesc) - elDesc = "(" + classDesc + ")"; - return rv + elDesc; + let attrPairs = Array.from(node.attributes || [], function(attr) { + return `${attr.name}="${attr.value}"`; + }).join(" "); + return `<${node.localName} ${attrPairs}>`; }; this.log = function () { if (typeof dump !== "undefined") { var msg = Array.prototype.map.call(arguments, function(x) { - return (x && x.nodeName) ? logEl(x) : x; + return (x && x.nodeName) ? logNode(x) : x; }).join(" "); dump("Reader: (Readability) " + msg + "\n"); } else if (typeof console !== "undefined") { - var args = ["Reader: (Readability) "].concat(arguments); + let args = Array.from(arguments, arg => { + if (arg && arg.nodeType == this.ELEMENT_NODE) { + return logNode(arg); + } + return arg; + }); + args.unshift("Reader: (Readability)"); console.log.apply(console, args); } }; @@ -120,7 +125,7 @@ Readability.prototype = { okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, - negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, replaceFonts: /<(\/?)font[^>]*>/gi, @@ -129,11 +134,19 @@ Readability.prototype = { shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, prevLink: /(prev|earl|old|new|<|«)/i, + tokenize: /\W+/g, whitespace: /^\s*$/, hasContent: /\S$/, + hashUrl: /^#.+/, + srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, + // See: https://schema.org/Article + jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/ }, - DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], + UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], + + DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]), ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], @@ -155,16 +168,27 @@ Readability.prototype = { // These are the classes that readability sets itself. CLASSES_TO_PRESERVE: [ "page" ], + // These are the list of HTML entities that need to be escaped. + HTML_ESCAPE_MAP: { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }, + /** * Run any post-process modifications to article content as necessary. * * @param Element * @return void - **/ + **/ _postProcessContent: function(articleContent) { // Readability cannot open relative uris so we convert them to absolute uris. this._fixRelativeUris(articleContent); + this._simplifyNestedElements(articleContent); + if (!this._keepClasses) { // Remove classes. this._cleanClasses(articleContent); @@ -209,8 +233,7 @@ Readability.prototype = { if (this._docJSDOMParser && nodeList._isLiveNodeList) { throw new Error("Do not pass live node lists to _replaceNodeTags"); } - for (var i = nodeList.length - 1; i >= 0; i--) { - var node = nodeList[i]; + for (const node of nodeList) { this._setNodeTag(node, newTagName); } }, @@ -230,6 +253,21 @@ Readability.prototype = { Array.prototype.forEach.call(nodeList, fn, this); }, + /** + * Iterate over a NodeList, and return the first node that passes + * the supplied test function + * + * For convenience, the current object context is applied to the provided + * test function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The test function. + * @return void + */ + _findNode: function(nodeList, fn) { + return Array.prototype.find.call(nodeList, fn, this); + }, + /** * Iterate over a NodeList, return true if any of the provided iterate * function calls returns true, false otherwise. @@ -296,11 +334,11 @@ Readability.prototype = { _cleanClasses: function(node) { var classesToPreserve = this._classesToPreserve; var className = (node.getAttribute("class") || "") - .split(/\s+/) - .filter(function(cls) { - return classesToPreserve.indexOf(cls) != -1; - }) - .join(" "); + .split(/\s+/) + .filter(function(cls) { + return classesToPreserve.indexOf(cls) != -1; + }) + .join(" "); if (className) { node.setAttribute("class", className); @@ -328,6 +366,7 @@ Readability.prototype = { if (baseURI == documentURI && uri.charAt(0) == "#") { return uri; } + // Otherwise, resolve against base URI: try { return new URL(uri, baseURI).href; @@ -351,8 +390,8 @@ Readability.prototype = { } else { // if the link has multiple children, they should all be preserved var container = this._doc.createElement("span"); - while (link.childNodes.length > 0) { - container.appendChild(link.childNodes[0]); + while (link.firstChild) { + container.appendChild(link.firstChild); } link.parentNode.replaceChild(container, link); } @@ -362,19 +401,60 @@ Readability.prototype = { } }); - var imgs = this._getAllNodesWithTag(articleContent, ["img"]); - this._forEachNode(imgs, function(img) { - var src = img.getAttribute("src"); + var medias = this._getAllNodesWithTag(articleContent, [ + "img", "picture", "figure", "video", "audio", "source" + ]); + + this._forEachNode(medias, function(media) { + var src = media.getAttribute("src"); + var poster = media.getAttribute("poster"); + var srcset = media.getAttribute("srcset"); + if (src) { - img.setAttribute("src", toAbsoluteURI(src)); + media.setAttribute("src", toAbsoluteURI(src)); + } + + if (poster) { + media.setAttribute("poster", toAbsoluteURI(poster)); + } + + if (srcset) { + var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + }); + + media.setAttribute("srcset", newSrcset); } }); }, + _simplifyNestedElements: function(articleContent) { + var node = articleContent; + + while (node) { + if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { + if (this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { + var child = node.children[0]; + for (var i = 0; i < node.attributes.length; i++) { + child.setAttribute(node.attributes[i].name, node.attributes[i].value); + } + node.parentNode.replaceChild(child, node); + node = child; + continue; + } + } + + node = this._getNextNode(node); + } + }, + /** * Get the article title as an H1. * - * @return void + * @return string **/ _getArticleTitle: function() { var doc = this._doc; @@ -407,8 +487,8 @@ Readability.prototype = { // Check if we have an heading containing this exact string, so we // could assume it's the full title. var headings = this._concatNodeLists( - doc.getElementsByTagName("h1"), - doc.getElementsByTagName("h2") + doc.getElementsByTagName("h1"), + doc.getElementsByTagName("h2") ); var trimmedTitle = curTitle.trim(); var match = this._someNode(headings, function(heading) { @@ -443,7 +523,7 @@ Readability.prototype = { var curTitleWordCount = wordCount(curTitle); if (curTitleWordCount <= 4 && (!titleHadHierarchicalSeparators || - curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { + curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { curTitle = origTitle; } @@ -470,15 +550,15 @@ Readability.prototype = { }, /** - * Finds the next element, starting from the given node, and ignoring + * Finds the next node, starting from the given node, and ignoring * whitespace in between. If the given node is an element, the same node is * returned. */ - _nextElement: function (node) { + _nextNode: function (node) { var next = node; while (next - && (next.nodeType != this.ELEMENT_NODE) - && this.REGEXPS.whitespace.test(next.textContent)) { + && (next.nodeType != this.ELEMENT_NODE) + && this.REGEXPS.whitespace.test(next.textContent)) { next = next.nextSibling; } return next; @@ -499,10 +579,10 @@ Readability.prototype = { //

block. var replaced = false; - // If we find a
chain, remove the
s until we hit another element + // If we find a
chain, remove the
s until we hit another node // or non-whitespace. This leaves behind the first
in the chain // (which will be replaced with a

later). - while ((next = this._nextElement(next)) && (next.tagName == "BR")) { + while ((next = this._nextNode(next)) && (next.tagName == "BR")) { replaced = true; var brSibling = next.nextSibling; next.parentNode.removeChild(next); @@ -520,7 +600,7 @@ Readability.prototype = { while (next) { // If we've hit another

, we're done adding children to this

. if (next.tagName == "BR") { - var nextElem = this._nextElement(next.nextSibling); + var nextElem = this._nextNode(next.nextSibling); if (nextElem && nextElem.tagName == "BR") break; } @@ -597,7 +677,6 @@ Readability.prototype = { this._cleanConditionally(articleContent, "fieldset"); this._clean(articleContent, "object"); this._clean(articleContent, "embed"); - this._clean(articleContent, "h1"); this._clean(articleContent, "footer"); this._clean(articleContent, "link"); this._clean(articleContent, "aside"); @@ -613,25 +692,6 @@ Readability.prototype = { }); }); - // If there is only one h2 and its text content substantially equals article title, - // they are probably using it as a header and not a subheader, - // so remove it since we already extract the title separately. - var h2 = articleContent.getElementsByTagName("h2"); - if (h2.length === 1) { - var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length; - if (Math.abs(lengthSimilarRate) < 0.5) { - var titlesMatch = false; - if (lengthSimilarRate > 0) { - titlesMatch = h2[0].textContent.includes(this._articleTitle); - } else { - titlesMatch = this._articleTitle.includes(h2[0].textContent); - } - if (titlesMatch) { - this._clean(articleContent, "h2"); - } - } - } - this._clean(articleContent, "iframe"); this._clean(articleContent, "input"); this._clean(articleContent, "textarea"); @@ -645,6 +705,9 @@ Readability.prototype = { this._cleanConditionally(articleContent, "ul"); this._cleanConditionally(articleContent, "div"); + // replace H1 with H2 as H1 should be only title that is displayed separately + this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2"); + // Remove extra paragraphs this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { var imgCount = paragraph.getElementsByTagName("img").length; @@ -658,7 +721,7 @@ Readability.prototype = { }); this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { - var next = this._nextElement(br.nextSibling); + var next = this._nextNode(br.nextSibling); if (next && next.tagName == "P") br.parentNode.removeChild(br); }); @@ -683,7 +746,7 @@ Readability.prototype = { * * @param Element * @return void - **/ + **/ _initializeNode: function(node) { node.readability = {"contentScore": 0}; @@ -754,6 +817,21 @@ Readability.prototype = { return node && node.nextElementSibling; }, + // compares second text to first one + // 1 = same text, 0 = completely different text + // works the way that it splits both texts into words and then finds words that are unique in second text + // the result is given by the lower length of unique parts + _textSimilarity: function(textA, textB) { + var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + if (!tokensA.length || !tokensB.length) { + return 0; + } + var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); + var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; + return 1 - distanceB; + }, + _checkByline: function(node, matchString) { if (this._articleByline) { return false; @@ -790,11 +868,11 @@ Readability.prototype = { * * @param page a document to run upon. Needs to be a full document, complete with body. * @return Element - **/ + **/ _grabArticle: function (page) { this.log("**** grabArticle ****"); var doc = this._doc; - var isPaging = (page !== null ? true: false); + var isPaging = page !== null; page = page ? page : this._doc.body; // We can't grab an article if we don't have a page! @@ -806,6 +884,7 @@ Readability.prototype = { var pageCacheHtml = page.innerHTML; while (true) { + this.log("Starting grabArticle loop"); var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); // First, node prepping. Trash nodes that look cruddy (like ones with the @@ -814,7 +893,14 @@ Readability.prototype = { var elementsToScore = []; var node = this._doc.documentElement; + let shouldRemoveTitleHeader = true; + while (node) { + + if (node.tagName === "HTML") { + this._articleLang = node.getAttribute("lang"); + } + var matchString = node.className + " " + node.id; if (!this._isProbablyVisible(node)) { @@ -829,23 +915,37 @@ Readability.prototype = { continue; } + if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { + this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); + shouldRemoveTitleHeader = false; + node = this._removeAndGetNext(node); + continue; + } + // Remove unlikely candidates if (stripUnlikelyCandidates) { if (this.REGEXPS.unlikelyCandidates.test(matchString) && !this.REGEXPS.okMaybeItsACandidate.test(matchString) && !this._hasAncestorTag(node, "table") && + !this._hasAncestorTag(node, "code") && node.tagName !== "BODY" && node.tagName !== "A") { this.log("Removing unlikely candidate - " + matchString); node = this._removeAndGetNext(node); continue; } + + if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { + this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); + node = this._removeAndGetNext(node); + continue; + } } // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || - node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || - node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && + node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || + node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && this._isElementWithoutContent(node)) { node = this._removeAndGetNext(node); continue; @@ -901,7 +1001,7 @@ Readability.prototype = { * Then add their score to their parent node. * * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - **/ + **/ var candidates = []; this._forEachNode(elementsToScore, function(elementToScore) { if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") @@ -913,7 +1013,7 @@ Readability.prototype = { return; // Exclude nodes with no ancestor. - var ancestors = this._getNodeAncestors(elementToScore, 3); + var ancestors = this._getNodeAncestors(elementToScore, 5); if (ancestors.length === 0) return; @@ -990,10 +1090,9 @@ Readability.prototype = { neededToCreateTopCandidate = true; // Move everything (not just elements, also text nodes etc.) into the container // so we even include text directly in the body: - var kids = page.childNodes; - while (kids.length) { - this.log("Moving child out:", kids[0]); - topCandidate.appendChild(kids[0]); + while (page.firstChild) { + this.log("Moving child out:", page.firstChild); + topCandidate.appendChild(page.firstChild); } page.appendChild(topCandidate); @@ -1106,7 +1205,7 @@ Readability.prototype = { if (nodeLength > 80 && linkDensity < 0.25) { append = true; } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && - nodeContent.search(/\.( |$)/) !== -1) { + nodeContent.search(/\.( |$)/) !== -1) { append = true; } } @@ -1124,6 +1223,9 @@ Readability.prototype = { } articleContent.appendChild(sibling); + // Fetch children again to make it compatible + // with DOM parsers without live collection support. + siblings = parentOfTopCandidate.children; // siblings is a reference to the children array, and // sibling is removed from the array when we call appendChild(). // As a result, we must revisit this index since the nodes @@ -1151,9 +1253,8 @@ Readability.prototype = { var div = doc.createElement("DIV"); div.id = "readability-page-1"; div.className = "page"; - var children = articleContent.childNodes; - while (children.length) { - div.appendChild(children[0]); + while (articleContent.firstChild) { + div.appendChild(articleContent.firstChild); } articleContent.appendChild(div); } @@ -1233,12 +1334,127 @@ Readability.prototype = { return false; }, + /** + * Converts some of the common HTML entities in string to their corresponding characters. + * + * @param str {string} - a string to unescape. + * @return string without HTML entity. + */ + _unescapeHtmlEntities: function(str) { + if (!str) { + return str; + } + + var htmlEscapeMap = this.HTML_ESCAPE_MAP; + return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { + return htmlEscapeMap[tag]; + }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 16 : 10); + return String.fromCharCode(num); + }); + }, + + /** + * Try to extract metadata from JSON-LD object. + * For now, only Schema.org objects of type Article or its subtypes are supported. + * @return Object with any metadata that could be extracted (possibly none) + */ + _getJSONLD: function (doc) { + var scripts = this._getAllNodesWithTag(doc, ["script"]); + + var metadata; + + this._forEachNode(scripts, function(jsonLdElement) { + if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") { + try { + // Strip CDATA markers if present + var content = jsonLdElement.textContent.replace(/^\s*\s*$/g, ""); + var parsed = JSON.parse(content); + if ( + !parsed["@context"] || + !parsed["@context"].match(/^https?\:\/\/schema\.org$/) + ) { + return; + } + + if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { + parsed = parsed["@graph"].find(function(it) { + return (it["@type"] || "").match( + this.REGEXPS.jsonLdArticleTypes + ); + }); + } + + if ( + !parsed || + !parsed["@type"] || + !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) + ) { + return; + } + + metadata = {}; + + if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) { + // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz + // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either + // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. + + var title = this._getArticleTitle(); + var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; + var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75; + + if (headlineMatches && !nameMatches) { + metadata.title = parsed.headline; + } else { + metadata.title = parsed.name; + } + } else if (typeof parsed.name === "string") { + metadata.title = parsed.name.trim(); + } else if (typeof parsed.headline === "string") { + metadata.title = parsed.headline.trim(); + } + if (parsed.author) { + if (typeof parsed.author.name === "string") { + metadata.byline = parsed.author.name.trim(); + } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") { + metadata.byline = parsed.author + .filter(function(author) { + return author && typeof author.name === "string"; + }) + .map(function(author) { + return author.name.trim(); + }) + .join(", "); + } + } + if (typeof parsed.description === "string") { + metadata.excerpt = parsed.description.trim(); + } + if ( + parsed.publisher && + typeof parsed.publisher.name === "string" + ) { + metadata.siteName = parsed.publisher.name.trim(); + } + return; + } catch (err) { + this.log(err.message); + } + } + }); + return metadata ? metadata : {}; + }, + /** * Attempts to get excerpt and byline metadata for the article. * + * @param {Object} jsonld — object containing any metadata that + * could be extracted from JSON-LD object. + * * @return Object with optional "excerpt" and "byline" properties */ - _getArticleMetadata: function() { + _getArticleMetadata: function(jsonld) { var metadata = {}; var values = {}; var metaElements = this._doc.getElementsByTagName("meta"); @@ -1263,13 +1479,11 @@ Readability.prototype = { if (elementProperty) { matches = elementProperty.match(propertyPattern); if (matches) { - for (var i = matches.length - 1; i >= 0; i--) { - // Convert to lowercase, and remove any whitespace - // so we can match below. - name = matches[i].toLowerCase().replace(/\s/g, ""); - // multiple authors - values[name] = content.trim(); - } + // Convert to lowercase, and remove any whitespace + // so we can match below. + name = matches[0].toLowerCase().replace(/\s/g, ""); + // multiple authors + values[name] = content.trim(); } } if (!matches && elementName && namePattern.test(elementName)) { @@ -1284,34 +1498,45 @@ Readability.prototype = { }); // get title - metadata.title = values["dc:title"] || - values["dcterm:title"] || - values["og:title"] || - values["weibo:article:title"] || - values["weibo:webpage:title"] || - values["title"] || - values["twitter:title"]; + metadata.title = jsonld.title || + values["dc:title"] || + values["dcterm:title"] || + values["og:title"] || + values["weibo:article:title"] || + values["weibo:webpage:title"] || + values["title"] || + values["twitter:title"]; if (!metadata.title) { metadata.title = this._getArticleTitle(); } // get author - metadata.byline = values["dc:creator"] || - values["dcterm:creator"] || - values["author"]; + metadata.byline = jsonld.byline || + values["dc:creator"] || + values["dcterm:creator"] || + values["author"]; // get description - metadata.excerpt = values["dc:description"] || - values["dcterm:description"] || - values["og:description"] || - values["weibo:article:description"] || - values["weibo:webpage:description"] || - values["description"] || - values["twitter:description"]; + metadata.excerpt = jsonld.excerpt || + values["dc:description"] || + values["dcterm:description"] || + values["og:description"] || + values["weibo:article:description"] || + values["weibo:webpage:description"] || + values["description"] || + values["twitter:description"]; // get site name - metadata.siteName = values["og:site_name"]; + metadata.siteName = jsonld.siteName || + values["og:site_name"]; + + // in many sites the meta value is escaped with HTML entities, + // so here we need to unescape it + metadata.title = this._unescapeHtmlEntities(metadata.title); + metadata.byline = this._unescapeHtmlEntities(metadata.byline); + metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); + metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); return metadata; }, @@ -1321,7 +1546,7 @@ Readability.prototype = { * whether as a direct child or as its descendants. * * @param Element - **/ + **/ _isSingleImage: function(node) { if (node.tagName === "IMG") { return true; @@ -1341,7 +1566,7 @@ Readability.prototype = { * some sites (e.g. Medium). * * @param Element - **/ + **/ _unwrapNoscriptImages: function(doc) { // Find img without source or attributes that might contains image, and remove it. // This is done to prevent a placeholder img is replaced by img from noscript in next step. @@ -1415,7 +1640,7 @@ Readability.prototype = { * Removes script tags from the document. * * @param Element - **/ + **/ _removeScripts: function(doc) { this._removeNodes(this._getAllNodesWithTag(doc, ["script"]), function(scriptNode) { scriptNode.nodeValue = ""; @@ -1432,7 +1657,7 @@ Readability.prototype = { * * @param Element * @param string tag of child element - **/ + **/ _hasSingleTagInsideElement: function(element, tag) { // There should be exactly 1 element child with given tag if (element.children.length != 1 || element.children[0].tagName !== tag) { @@ -1442,15 +1667,15 @@ Readability.prototype = { // And there should be no text nodes with real content return !this._someNode(element.childNodes, function(node) { return node.nodeType === this.TEXT_NODE && - this.REGEXPS.hasContent.test(node.textContent); + this.REGEXPS.hasContent.test(node.textContent); }); }, _isElementWithoutContent: function(node) { return node.nodeType === this.ELEMENT_NODE && - node.textContent.trim().length == 0 && - (node.children.length == 0 || - node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length); + node.textContent.trim().length == 0 && + (node.children.length == 0 || + node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length); }, /** @@ -1460,24 +1685,24 @@ Readability.prototype = { */ _hasChildBlockElement: function (element) { return this._someNode(element.childNodes, function(node) { - return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 || - this._hasChildBlockElement(node); + return this.DIV_TO_P_ELEMS.has(node.tagName) || + this._hasChildBlockElement(node); }); }, /*** * Determine if a node qualifies as phrasing content. * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content - **/ + **/ _isPhrasingContent: function(node) { return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 || - ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") && - this._everyNode(node.childNodes, this._isPhrasingContent)); + ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") && + this._everyNode(node.childNodes, this._isPhrasingContent)); }, _isWhitespace: function(node) { return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) || - (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR"); + (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR"); }, /** @@ -1487,7 +1712,7 @@ Readability.prototype = { * @param Element * @param Boolean normalizeSpaces (default: true) * @return string - **/ + **/ _getInnerText: function(e, normalizeSpaces) { normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces; var textContent = e.textContent.trim(); @@ -1504,7 +1729,7 @@ Readability.prototype = { * @param Element * @param string - what to split on. Default is "," * @return number (integer) - **/ + **/ _getCharCount: function(e, s) { s = s || ","; return this._getInnerText(e).split(s).length - 1; @@ -1516,7 +1741,7 @@ Readability.prototype = { * * @param Element * @return void - **/ + **/ _cleanStyles: function(e) { if (!e || e.tagName.toLowerCase() === "svg") return; @@ -1544,7 +1769,7 @@ Readability.prototype = { * * @param Element * @return number (float) - **/ + **/ _getLinkDensity: function(element) { var textLength = this._getInnerText(element).length; if (textLength === 0) @@ -1554,7 +1779,9 @@ Readability.prototype = { // XXX implement _reduceNodeList? this._forEachNode(element.getElementsByTagName("a"), function(linkNode) { - linkLength += this._getInnerText(linkNode).length; + var href = linkNode.getAttribute("href"); + var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1; + linkLength += this._getInnerText(linkNode).length * coefficient; }); return linkLength / textLength; @@ -1566,7 +1793,7 @@ Readability.prototype = { * * @param Element * @return number (Integer) - **/ + **/ _getClassWeight: function(e) { if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) return 0; @@ -1681,7 +1908,7 @@ Readability.prototype = { /** * Look for 'data' (as opposed to 'layout') tables, for which we use * similar checks as - * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920 + * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19 */ _markDataTables: function(root) { var tables = root.getElementsByTagName("table"); @@ -1739,36 +1966,84 @@ Readability.prototype = { /* convert images and figures that have properties like data-src into images that can be loaded without JS */ _fixLazyImages: function (root) { this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) { - // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 - if ((!elem.src && (!elem.srcset || elem.srcset == "null")) || elem.className.toLowerCase().indexOf("lazy") !== -1) { + // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute. + // So, here we check if the data uri is too short, just might as well remove it. + if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) { + // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. + var parts = this.REGEXPS.b64DataUrl.exec(elem.src); + if (parts[1] === "image/svg+xml") { + return; + } + + // Make sure this element has other attributes which contains image. + // If it doesn't, then this src is important and shouldn't be removed. + var srcCouldBeRemoved = false; for (var i = 0; i < elem.attributes.length; i++) { var attr = elem.attributes[i]; - if (attr.name === "src" || attr.name === "srcset") { + if (attr.name === "src") { continue; } - var copyTo = null; - if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { - copyTo = "srcset"; - } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { - copyTo = "src"; + + if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { + srcCouldBeRemoved = true; + break; } - if (copyTo) { - //if this is an img or picture, set the attribute directly - if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { - elem.setAttribute(copyTo, attr.value); - } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) { - //if the item is a

that does not contain an image or picture, create one and place it inside the figure - //see the nytimes-3 testcase for an example - var img = this._doc.createElement("img"); - img.setAttribute(copyTo, attr.value); - elem.appendChild(img); - } + } + + // Here we assume if image is less than 100 bytes (or 133B after encoded to base64) + // it will be too small, therefore it might be placeholder image. + if (srcCouldBeRemoved) { + var b64starts = elem.src.search(/base64\s*/i) + 7; + var b64length = elem.src.length - b64starts; + if (b64length < 133) { + elem.removeAttribute("src"); + } + } + } + + // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 + if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) { + return; + } + + for (var j = 0; j < elem.attributes.length; j++) { + attr = elem.attributes[j]; + if (attr.name === "src" || attr.name === "srcset" || attr.name === "alt") { + continue; + } + var copyTo = null; + if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { + copyTo = "srcset"; + } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { + copyTo = "src"; + } + if (copyTo) { + //if this is an img or picture, set the attribute directly + if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { + elem.setAttribute(copyTo, attr.value); + } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) { + //if the item is a
that does not contain an image or picture, create one and place it inside the figure + //see the nytimes-3 testcase for an example + var img = this._doc.createElement("img"); + img.setAttribute(copyTo, attr.value); + elem.appendChild(img); } } } }); }, + _getTextDensity: function(e, tags) { + var textLength = this._getInnerText(e, true).length; + if (textLength === 0) { + return 0; + } + var childrenLength = 0; + var children = this._getAllNodesWithTag(e, tags); + this._forEachNode(children, (child) => childrenLength += this._getInnerText(child, true).length); + return childrenLength / textLength; + }, + /** * Clean an element of all tags of type "tag" if they look fishy. * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. @@ -1779,8 +2054,6 @@ Readability.prototype = { if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) return; - var isList = tag === "ul" || tag === "ol"; - // Gather counts for other typical elements embedded within. // Traverse backwards so we can remove nodes at the same time // without effecting the traversal. @@ -1792,6 +2065,14 @@ Readability.prototype = { return t._readabilityDataTable; }; + var isList = tag === "ul" || tag === "ol"; + if (!isList) { + var listLength = 0; + var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]); + this._forEachNode(listNodes, (list) => listLength += this._getInnerText(list).length); + isList = listLength / this._getInnerText(node).length > 0.9; + } + if (tag === "table" && isDataTable(node)) { return false; } @@ -1801,11 +2082,16 @@ Readability.prototype = { return false; } + if (this._hasAncestorTag(node, "code")) { + return false; + } + var weight = this._getClassWeight(node); - var contentScore = 0; this.log("Cleaning Conditionally", node); + var contentScore = 0; + if (weight + contentScore < 0) { return true; } @@ -1818,6 +2104,7 @@ Readability.prototype = { var img = node.getElementsByTagName("img").length; var li = node.getElementsByTagName("li").length - 100; var input = node.getElementsByTagName("input").length; + var headingDensity = this._getTextDensity(node, ["h1", "h2", "h3", "h4", "h5", "h6"]); var embedCount = 0; var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]); @@ -1842,13 +2129,13 @@ Readability.prototype = { var contentLength = this._getInnerText(node).length; var haveToRemove = - (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || - (!isList && li > p) || - (input > Math.floor(p/3)) || - (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || - (!isList && weight < 25 && linkDensity > 0.2) || - (weight >= 25 && linkDensity > 0.5) || - ((embedCount === 1 && contentLength < 75) || embedCount > 1); + (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || + (!isList && li > p) || + (input > Math.floor(p/3)) || + (!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || + (!isList && weight < 25 && linkDensity > 0.2) || + (weight >= 25 && linkDensity > 0.5) || + ((embedCount === 1 && contentLength < 75) || embedCount > 1); return haveToRemove; } return false; @@ -1875,17 +2162,38 @@ Readability.prototype = { }, /** - * Clean out spurious headers from an Element. Checks things like classnames and link density. + * Clean out spurious headers from an Element. * * @param Element * @return void - **/ + **/ _cleanHeaders: function(e) { - this._removeNodes(this._getAllNodesWithTag(e, ["h1", "h2"]), function (header) { - return this._getClassWeight(header) < 0; + let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]); + this._removeNodes(headingNodes, function(node) { + let shouldRemove = this._getClassWeight(node) < 0; + if (shouldRemove) { + this.log("Removing header with low class weight:", node); + } + return shouldRemove; }); }, + /** + * Check if this node is an H1 or H2 element whose content is mostly + * the same as the article title. + * + * @param Element the node to check. + * @return boolean indicating whether this is a title-like header. + */ + _headerDuplicatesTitle: function(node) { + if (node.tagName != "H1" && node.tagName != "H2") { + return false; + } + var heading = this._getInnerText(node, false); + this.log("Evaluating similarity of header:", heading, this._articleTitle); + return this._textSimilarity(this._articleTitle, heading) > 0.75; + }, + _flagIsActive: function(flag) { return (this._flags & flag) > 0; }, @@ -1897,9 +2205,9 @@ Readability.prototype = { _isProbablyVisible: function(node) { // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes. return (!node.style || node.style.display != "none") - && !node.hasAttribute("hidden") - //check for "fallback-image" so that wikimedia math images are displayed - && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); + && !node.hasAttribute("hidden") + //check for "fallback-image" so that wikimedia math images are displayed + && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); }, /** @@ -1926,12 +2234,15 @@ Readability.prototype = { // Unwrap image from noscript this._unwrapNoscriptImages(this._doc); + // Extract JSON-LD metadata before removing scripts + var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc); + // Remove script tags from the document. this._removeScripts(this._doc); this._prepDocument(); - var metadata = this._getArticleMetadata(); + var metadata = this._getArticleMetadata(jsonLd); this._articleTitle = metadata.title; var articleContent = this._grabArticle(); @@ -1954,11 +2265,11 @@ Readability.prototype = { var textContent = articleContent.textContent; return { - articleContent: articleContent, title: this._articleTitle, byline: metadata.byline || this._articleByline, dir: this._articleDir, - content: articleContent.innerHTML, + lang: this._articleLang, + content: this._serializer(articleContent), textContent: textContent, length: textContent.length, excerpt: metadata.excerpt,