diff --git a/apps/web-clipper/.gitignore b/apps/web-clipper/.gitignore new file mode 100644 index 000000000..77738287f --- /dev/null +++ b/apps/web-clipper/.gitignore @@ -0,0 +1 @@ +dist/ \ No newline at end of file diff --git a/apps/web-clipper/.idea/.gitignore b/apps/web-clipper/.idea/.gitignore new file mode 100644 index 000000000..c0f9e196c --- /dev/null +++ b/apps/web-clipper/.idea/.gitignore @@ -0,0 +1,5 @@ +# Default ignored files +/workspace.xml + +# Datasource local storage ignored files +/dataSources.local.xml \ No newline at end of file diff --git a/apps/web-clipper/.idea/inspectionProfiles/Project_Default.xml b/apps/web-clipper/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 000000000..146ab09b7 --- /dev/null +++ b/apps/web-clipper/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,10 @@ + + + + \ No newline at end of file diff --git a/apps/web-clipper/.idea/misc.xml b/apps/web-clipper/.idea/misc.xml new file mode 100644 index 000000000..7e5bdf89f --- /dev/null +++ b/apps/web-clipper/.idea/misc.xml @@ -0,0 +1,9 @@ + + + + + + + + \ No newline at end of file diff --git a/apps/web-clipper/.idea/modules.xml b/apps/web-clipper/.idea/modules.xml new file mode 100644 index 000000000..ebf785642 --- /dev/null +++ b/apps/web-clipper/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/apps/web-clipper/.idea/vcs.xml b/apps/web-clipper/.idea/vcs.xml new file mode 100644 index 000000000..94a25f7f4 --- /dev/null +++ b/apps/web-clipper/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/apps/web-clipper/LICENSE b/apps/web-clipper/LICENSE new file mode 100644 index 000000000..f288702d2 --- /dev/null +++ b/apps/web-clipper/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/apps/web-clipper/README.md b/apps/web-clipper/README.md new file mode 100644 index 000000000..a37d0e181 --- /dev/null +++ b/apps/web-clipper/README.md @@ -0,0 +1,24 @@ +# Trilium Web Clipper + +## This repo is dead + +**Trilium is in maintenance mode and Web Clipper is not likely to get new releases.** + +Trilium Web Clipper is a web browser extension which allows user to clip text, screenshots, whole pages and short notes and save them directly to [Trilium Notes](https://github.com/zadam/trilium). + +For more details, see the [wiki page](https://github.com/zadam/trilium/wiki/Web-clipper). + +## Keyboard shortcuts +Keyboard shortcuts are available for most functions: +* Save selected text: `Ctrl+Shift+S` (Mac: `Cmd+Shift+S`) +* Save whole page: `Alt+Shift+S` (Mac: `Opt+Shift+S`) +* Save screenshot: `Ctrl+Shift+E` (Mac: `Cmd+Shift+E`) + +To set custom shortcuts, follow the directions for your browser. + +**Firefox**: `about:addons` > Gear icon ⚙️ > Manage extension shortcuts + +**Chrome**: `chrome://extensions/shortcuts` + +## Credits +Some parts of the code are based on the [Joplin Notes browser extension](https://github.com/laurent22/joplin/tree/master/Clipper). diff --git a/apps/web-clipper/background.js b/apps/web-clipper/background.js new file mode 100644 index 000000000..4074987ab --- /dev/null +++ b/apps/web-clipper/background.js @@ -0,0 +1,451 @@ +// Keyboard shortcuts +chrome.commands.onCommand.addListener(async function (command) { + if (command == "saveSelection") { + await saveSelection(); + } else if (command == "saveWholePage") { + await saveWholePage(); + } else if (command == "saveTabs") { + await saveTabs(); + } else if (command == "saveCroppedScreenshot") { + const activeTab = await getActiveTab(); + + await saveCroppedScreenshot(activeTab.url); + } else { + console.log("Unrecognized command", command); + } +}); + +function cropImage(newArea, dataUrl) { + return new Promise((resolve, reject) => { + const img = new Image(); + + img.onload = function () { + const canvas = document.createElement('canvas'); + canvas.width = newArea.width; + canvas.height = newArea.height; + + const ctx = canvas.getContext('2d'); + + ctx.drawImage(img, newArea.x, newArea.y, newArea.width, newArea.height, 0, 0, newArea.width, newArea.height); + + resolve(canvas.toDataURL()); + }; + + img.src = dataUrl; + }); +} + +async function takeCroppedScreenshot(cropRect) { + const activeTab = await getActiveTab(); + const zoom = await browser.tabs.getZoom(activeTab.id) * window.devicePixelRatio; + + const newArea = Object.assign({}, cropRect); + newArea.x *= zoom; + newArea.y *= zoom; + newArea.width *= zoom; + newArea.height *= zoom; + + const dataUrl = await browser.tabs.captureVisibleTab(null, { format: 'png' }); + + return await cropImage(newArea, dataUrl); +} + +async function takeWholeScreenshot() { + // this saves only visible portion of the page + // workaround to save the whole page is to scroll & stitch + // example in https://github.com/mrcoles/full-page-screen-capture-chrome-extension + // see page.js and popup.js + return await browser.tabs.captureVisibleTab(null, { format: 'png' }); +} + +browser.runtime.onInstalled.addListener(() => { + if (isDevEnv()) { + browser.browserAction.setIcon({ + path: 'icons/32-dev.png', + }); + } +}); + +browser.contextMenus.create({ + id: "trilium-save-selection", + title: "Save selection to Trilium", + contexts: ["selection"] +}); + +browser.contextMenus.create({ + id: "trilium-save-cropped-screenshot", + title: "Clip screenshot to Trilium", + contexts: ["page"] +}); + +browser.contextMenus.create({ + id: "trilium-save-cropped-screenshot", + title: "Crop screen shot to Trilium", + contexts: ["page"] +}); + +browser.contextMenus.create({ + id: "trilium-save-whole-screenshot", + title: "Save whole screen shot to Trilium", + contexts: ["page"] +}); + +browser.contextMenus.create({ + id: "trilium-save-page", + title: "Save whole page to Trilium", + contexts: ["page"] +}); + +browser.contextMenus.create({ + id: "trilium-save-link", + title: "Save link to Trilium", + contexts: ["link"] +}); + +browser.contextMenus.create({ + id: "trilium-save-image", + title: "Save image to Trilium", + contexts: ["image"] +}); + +async function getActiveTab() { + const tabs = await browser.tabs.query({ + active: true, + currentWindow: true + }); + + return tabs[0]; +} + +async function getWindowTabs() { + const tabs = await browser.tabs.query({ + currentWindow: true + }); + + return tabs; +} + +async function sendMessageToActiveTab(message) { + const activeTab = await getActiveTab(); + + if (!activeTab) { + throw new Error("No active tab."); + } + + try { + return await browser.tabs.sendMessage(activeTab.id, message); + } + catch (e) { + throw e; + } +} + +function toast(message, noteId = null, tabIds = null) { + sendMessageToActiveTab({ + name: 'toast', + message: message, + noteId: noteId, + tabIds: tabIds + }); +} + +function blob2base64(blob) { + return new Promise(resolve => { + const reader = new FileReader(); + reader.onloadend = function() { + resolve(reader.result); + }; + reader.readAsDataURL(blob); + }); +} + +async function fetchImage(url) { + const resp = await fetch(url); + const blob = await resp.blob(); + + return await blob2base64(blob); +} + +async function postProcessImage(image) { + if (image.src.startsWith("data:image/")) { + image.dataUrl = image.src; + image.src = "inline." + image.src.substr(11, 3); // this should extract file type - png/jpg + } + else { + try { + image.dataUrl = await fetchImage(image.src, image); + } + catch (e) { + console.log(`Cannot fetch image from ${image.src}`); + } + } +} + +async function postProcessImages(resp) { + if (resp.images) { + for (const image of resp.images) { + await postProcessImage(image); + } + } +} + +async function saveSelection() { + const payload = await sendMessageToActiveTab({name: 'trilium-save-selection'}); + + await postProcessImages(payload); + + const resp = await triliumServerFacade.callService('POST', 'clippings', payload); + + if (!resp) { + return; + } + + toast("Selection has been saved to Trilium.", resp.noteId); +} + +async function getImagePayloadFromSrc(src, pageUrl) { + const image = { + imageId: randomString(20), + src: src + }; + + await postProcessImage(image); + + const activeTab = await getActiveTab(); + + return { + title: activeTab.title, + content: ``, + images: [image], + pageUrl: pageUrl + }; +} + +async function saveCroppedScreenshot(pageUrl) { + const cropRect = await sendMessageToActiveTab({name: 'trilium-get-rectangle-for-screenshot'}); + + const src = await takeCroppedScreenshot(cropRect); + + const payload = await getImagePayloadFromSrc(src, pageUrl); + + const resp = await triliumServerFacade.callService("POST", "clippings", payload); + + if (!resp) { + return; + } + + toast("Screenshot has been saved to Trilium.", resp.noteId); +} + +async function saveWholeScreenshot(pageUrl) { + const src = await takeWholeScreenshot(); + + const payload = await getImagePayloadFromSrc(src, pageUrl); + + const resp = await triliumServerFacade.callService("POST", "clippings", payload); + + if (!resp) { + return; + } + + toast("Screenshot has been saved to Trilium.", resp.noteId); +} + +async function saveImage(srcUrl, pageUrl) { + const payload = await getImagePayloadFromSrc(srcUrl, pageUrl); + + const resp = await triliumServerFacade.callService("POST", "clippings", payload); + + if (!resp) { + return; + } + + toast("Image has been saved to Trilium.", resp.noteId); +} + +async function saveWholePage() { + const payload = await sendMessageToActiveTab({name: 'trilium-save-page'}); + + await postProcessImages(payload); + + const resp = await triliumServerFacade.callService('POST', 'notes', payload); + + if (!resp) { + return; + } + + toast("Page has been saved to Trilium.", resp.noteId); +} + +async function saveLinkWithNote(title, content) { + const activeTab = await getActiveTab(); + + if (!title.trim()) { + title = activeTab.title; + } + + const resp = await triliumServerFacade.callService('POST', 'notes', { + title: title, + content: content, + clipType: 'note', + pageUrl: activeTab.url + }); + + if (!resp) { + return false; + } + + toast("Link with note has been saved to Trilium.", resp.noteId); + + return true; +} + +async function getTabsPayload(tabs) { + let content = '
    '; + tabs.forEach(tab => { + content += `
  • ${tab.title}
  • ` + }); + content += '
'; + + const domainsCount = tabs.map(tab => tab.url) + .reduce((acc, url) => { + const hostname = new URL(url).hostname + return acc.set(hostname, (acc.get(hostname) || 0) + 1) + }, new Map()); + + let topDomains = [...domainsCount] + .sort((a, b) => {return b[1]-a[1]}) + .slice(0,3) + .map(domain=>domain[0]) + .join(', ') + + if (tabs.length > 3) { topDomains += '...' } + + return { + title: `${tabs.length} browser tabs: ${topDomains}`, + content: content, + clipType: 'tabs' + }; +} + +async function saveTabs() { + const tabs = await getWindowTabs(); + + const payload = await getTabsPayload(tabs); + + const resp = await triliumServerFacade.callService('POST', 'notes', payload); + + if (!resp) { + return; + } + + const tabIds = tabs.map(tab=>{return tab.id}); + + toast(`${tabs.length} links have been saved to Trilium.`, resp.noteId, tabIds); +} + +browser.contextMenus.onClicked.addListener(async function(info, tab) { + if (info.menuItemId === 'trilium-save-selection') { + await saveSelection(); + } + else if (info.menuItemId === 'trilium-save-cropped-screenshot') { + await saveCroppedScreenshot(info.pageUrl); + } + else if (info.menuItemId === 'trilium-save-whole-screenshot') { + await saveWholeScreenshot(info.pageUrl); + } + else if (info.menuItemId === 'trilium-save-image') { + await saveImage(info.srcUrl, info.pageUrl); + } + else if (info.menuItemId === 'trilium-save-link') { + const link = document.createElement("a"); + link.href = info.linkUrl; + // linkText might be available only in firefox + link.appendChild(document.createTextNode(info.linkText || info.linkUrl)); + + const activeTab = await getActiveTab(); + + const resp = await triliumServerFacade.callService('POST', 'clippings', { + title: activeTab.title, + content: link.outerHTML, + pageUrl: info.pageUrl + }); + + if (!resp) { + return; + } + + toast("Link has been saved to Trilium.", resp.noteId); + } + else if (info.menuItemId === 'trilium-save-page') { + await saveWholePage(); + } + else { + console.log("Unrecognized menuItemId", info.menuItemId); + } +}); + +browser.runtime.onMessage.addListener(async request => { + console.log("Received", request); + + if (request.name === 'openNoteInTrilium') { + const resp = await triliumServerFacade.callService('POST', 'open/' + request.noteId); + + if (!resp) { + return; + } + + // desktop app is not available so we need to open in browser + if (resp.result === 'open-in-browser') { + const {triliumServerUrl} = await browser.storage.sync.get("triliumServerUrl"); + + if (triliumServerUrl) { + const noteUrl = triliumServerUrl + '/#' + request.noteId; + + console.log("Opening new tab in browser", noteUrl); + + browser.tabs.create({ + url: noteUrl + }); + } + else { + console.error("triliumServerUrl not found in local storage."); + } + } + } + else if (request.name === 'closeTabs') { + return await browser.tabs.remove(request.tabIds) + } + else if (request.name === 'load-script') { + return await browser.tabs.executeScript({file: request.file}); + } + else if (request.name === 'save-cropped-screenshot') { + const activeTab = await getActiveTab(); + + return await saveCroppedScreenshot(activeTab.url); + } + else if (request.name === 'save-whole-screenshot') { + const activeTab = await getActiveTab(); + + return await saveWholeScreenshot(activeTab.url); + } + else if (request.name === 'save-whole-page') { + return await saveWholePage(); + } + else if (request.name === 'save-link-with-note') { + return await saveLinkWithNote(request.title, request.content); + } + else if (request.name === 'save-tabs') { + return await saveTabs(); + } + else if (request.name === 'trigger-trilium-search') { + triliumServerFacade.triggerSearchForTrilium(); + } + else if (request.name === 'send-trilium-search-status') { + triliumServerFacade.sendTriliumSearchStatusToPopup(); + } + else if (request.name === 'trigger-trilium-search-note-url') { + const activeTab = await getActiveTab(); + triliumServerFacade.triggerSearchNoteByUrl(activeTab.url); + } +}); diff --git a/apps/web-clipper/bin/release-chrome.sh b/apps/web-clipper/bin/release-chrome.sh new file mode 100755 index 000000000..2b96c9419 --- /dev/null +++ b/apps/web-clipper/bin/release-chrome.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +set -e + +VERSION=$(jq -r ".version" manifest.json) +CHROME_EXTENSION_ID=dfhgmnfclbebfobmblelddiejjcijbjm + +BUILD_DIR=trilium-web-clipper-chrome + +rm -rf "dist/$BUILD_DIR" +mkdir -p "dist/$BUILD_DIR" + +cp -r icons lib options popup *.js manifest.json "dist/$BUILD_DIR" + +cd dist/"${BUILD_DIR}" || exit + +jq '.name = "Trilium Web Clipper"' manifest.json | sponge manifest.json +jq 'del(.browser_specific_settings)' manifest.json | sponge manifest.json + +EXT_FILE_NAME=trilium_web_clipper-${VERSION}-chrome.zip + +zip -r ../${EXT_FILE_NAME} * + +cd .. +rm -r "${BUILD_DIR}" + +# https://github.com/fregante/chrome-webstore-upload-cli +chrome-webstore-upload upload --source ${EXT_FILE_NAME} --auto-publish --extension-id "${CHROME_EXTENSION_ID}" --client-id "${CHROME_CLIENT_ID}" --client-secret "${CHROME_CLIENT_SECRET}" --refresh-token "${CHROME_REFRESH_TOKEN}" + diff --git a/apps/web-clipper/bin/release-firefox.sh b/apps/web-clipper/bin/release-firefox.sh new file mode 100755 index 000000000..38633ac7a --- /dev/null +++ b/apps/web-clipper/bin/release-firefox.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -e + +WEB_EXT_ID="{1410742d-b377-40e7-a9db-63dc9c6ec99c}" + +ARTIFACT_NAME=trilium-web-clipper-firefox +BUILD_DIR=dist/$ARTIFACT_NAME + +rm -rf "$BUILD_DIR" +mkdir -p "$BUILD_DIR" + +cp -r icons lib options popup *.js manifest.json "$BUILD_DIR" + +cd dist/"${ARTIFACT_NAME}" || exit + +jq '.name = "Trilium Web Clipper"' manifest.json | sponge manifest.json + +web-ext sign --api-key $FIREFOX_API_KEY --api-secret $FIREFOX_API_SECRET --artifacts-dir ../ + +cd .. +rm -r "${ARTIFACT_NAME}" diff --git a/apps/web-clipper/bin/release.sh b/apps/web-clipper/bin/release.sh new file mode 100755 index 000000000..69ba675e0 --- /dev/null +++ b/apps/web-clipper/bin/release.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +set -e + +export GITHUB_REPO=trilium-web-clipper + +if [[ $# -eq 0 ]] ; then + echo "Missing argument of new version" + exit 1 +fi + +VERSION=$1 + +if ! [[ ${VERSION} =~ ^[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}(-.+)?$ ]] ; +then + echo "Version ${VERSION} isn't in format X.Y.Z" + exit 1 +fi + +if ! git diff-index --quiet HEAD --; then + echo "There are uncommitted changes" + exit 1 +fi + +echo "Releasing Trilium Web Clipper $VERSION" + +jq '.version = "'"$VERSION"'"' manifest.json | sponge manifest.json + +git add manifest.json + +echo 'module.exports = { buildDate:"'$(date --iso-8601=seconds)'", buildRevision: "'$(git log -1 --format="%H")'" };' > build.js + +git add build.js + +TAG=v$VERSION + +echo "Committing package.json version change" + +git commit -m "release $VERSION" +git push + +echo "Tagging commit with $TAG" + +git tag "$TAG" +git push origin "$TAG" + +bin/release-firefox.sh +bin/release-chrome.sh + +FIREFOX_BUILD=trilium_web_clipper-$VERSION-an+fx.xpi +CHROME_BUILD=trilium_web_clipper-${VERSION}-chrome.zip + +echo "Creating release in GitHub" + +github-release release \ + --tag "$TAG" \ + --name "$TAG release" + +echo "Uploading firefox build package" + +github-release upload \ + --tag "$TAG" \ + --name "$FIREFOX_BUILD" \ + --file "dist/$FIREFOX_BUILD" + +echo "Uploading chrome build package" + +github-release upload \ + --tag "$TAG" \ + --name "$CHROME_BUILD" \ + --file "dist/$CHROME_BUILD" + +echo "Release finished!" diff --git a/apps/web-clipper/build.js b/apps/web-clipper/build.js new file mode 100644 index 000000000..3826b2524 --- /dev/null +++ b/apps/web-clipper/build.js @@ -0,0 +1 @@ +module.exports = { buildDate:"2022-10-29T15:25:37+02:00", buildRevision: "c9c10a90aa9b94efdf150b0b2fd57f9df5bf2d0a" }; diff --git a/apps/web-clipper/content.js b/apps/web-clipper/content.js new file mode 100644 index 000000000..faacfa546 --- /dev/null +++ b/apps/web-clipper/content.js @@ -0,0 +1,351 @@ +function absoluteUrl(url) { + if (!url) { + return url; + } + + const protocol = url.toLowerCase().split(':')[0]; + if (['http', 'https', 'file'].indexOf(protocol) >= 0) { + return url; + } + + if (url.indexOf('//') === 0) { + return location.protocol + url; + } else if (url[0] === '/') { + return location.protocol + '//' + location.host + url; + } else { + return getBaseUrl() + '/' + url; + } +} + +function pageTitle() { + const titleElements = document.getElementsByTagName("title"); + + return titleElements.length ? titleElements[0].text.trim() : document.title.trim(); +} + +function getReadableDocument() { + // Readability directly change the passed document, so clone to preserve the original web page. + const documentCopy = document.cloneNode(true); + const readability = new Readability(documentCopy, { + serializer: el => el // so that .content is returned as DOM element instead of HTML + }); + + const article = readability.parse(); + + if (!article) { + throw new Error('Could not parse HTML document with Readability'); + } + + return { + title: article.title, + body: article.content, + } +} + +function getDocumentDates() { + var dates = { + publishedDate: null, + modifiedDate: null, + }; + + const articlePublishedTime = document.querySelector("meta[property='article:published_time']"); + if (articlePublishedTime && articlePublishedTime.getAttribute('content')) { + dates.publishedDate = new Date(articlePublishedTime.getAttribute('content')); + } + + const articleModifiedTime = document.querySelector("meta[property='article:modified_time']"); + if (articleModifiedTime && articleModifiedTime.getAttribute('content')) { + dates.modifiedDate = new Date(articleModifiedTime.getAttribute('content')); + } + + // TODO: if we didn't get dates from meta, then try to get them from JSON-LD + + return dates; +} + +function getRectangleArea() { + return new Promise((resolve, reject) => { + const overlay = document.createElement('div'); + overlay.style.opacity = '0.6'; + overlay.style.background = 'black'; + overlay.style.width = '100%'; + overlay.style.height = '100%'; + overlay.style.zIndex = 99999999; + overlay.style.top = 0; + overlay.style.left = 0; + overlay.style.position = 'fixed'; + + document.body.appendChild(overlay); + + const messageComp = document.createElement('div'); + + const messageCompWidth = 300; + messageComp.setAttribute("tabindex", "0"); // so that it can be focused + messageComp.style.position = 'fixed'; + messageComp.style.opacity = '0.95'; + messageComp.style.fontSize = '14px'; + messageComp.style.width = messageCompWidth + 'px'; + messageComp.style.maxWidth = messageCompWidth + 'px'; + messageComp.style.border = '1px solid black'; + messageComp.style.background = 'white'; + messageComp.style.color = 'black'; + messageComp.style.top = '10px'; + messageComp.style.textAlign = 'center'; + messageComp.style.padding = '10px'; + messageComp.style.left = Math.round(document.body.clientWidth / 2 - messageCompWidth / 2) + 'px'; + messageComp.style.zIndex = overlay.style.zIndex + 1; + + messageComp.textContent = 'Drag and release to capture a screenshot'; + + document.body.appendChild(messageComp); + + const selection = document.createElement('div'); + selection.style.opacity = '0.5'; + selection.style.border = '1px solid red'; + selection.style.background = 'white'; + selection.style.border = '2px solid black'; + selection.style.zIndex = overlay.style.zIndex - 1; + selection.style.top = 0; + selection.style.left = 0; + selection.style.position = 'fixed'; + + document.body.appendChild(selection); + + messageComp.focus(); // we listen on keypresses on this element to cancel on escape + + let isDragging = false; + let draggingStartPos = null; + let selectionArea = {}; + + function updateSelection() { + selection.style.left = selectionArea.x + 'px'; + selection.style.top = selectionArea.y + 'px'; + selection.style.width = selectionArea.width + 'px'; + selection.style.height = selectionArea.height + 'px'; + } + + function setSelectionSizeFromMouse(event) { + if (event.clientX < draggingStartPos.x) { + selectionArea.x = event.clientX; + } + + if (event.clientY < draggingStartPos.y) { + selectionArea.y = event.clientY; + } + + selectionArea.width = Math.max(1, Math.abs(event.clientX - draggingStartPos.x)); + selectionArea.height = Math.max(1, Math.abs(event.clientY - draggingStartPos.y)); + updateSelection(); + } + + function selection_mouseDown(event) { + selectionArea = {x: event.clientX, y: event.clientY, width: 0, height: 0}; + draggingStartPos = {x: event.clientX, y: event.clientY}; + isDragging = true; + updateSelection(); + } + + function selection_mouseMove(event) { + if (!isDragging) return; + setSelectionSizeFromMouse(event); + } + + function removeOverlay() { + isDragging = false; + + overlay.removeEventListener('mousedown', selection_mouseDown); + overlay.removeEventListener('mousemove', selection_mouseMove); + overlay.removeEventListener('mouseup', selection_mouseUp); + + document.body.removeChild(overlay); + document.body.removeChild(selection); + document.body.removeChild(messageComp); + } + + function selection_mouseUp(event) { + setSelectionSizeFromMouse(event); + + removeOverlay(); + + console.info('selectionArea:', selectionArea); + + if (!selectionArea || !selectionArea.width || !selectionArea.height) { + return; + } + + // Need to wait a bit before taking the screenshot to make sure + // the overlays have been removed and don't appear in the + // screenshot. 10ms is not enough. + setTimeout(() => resolve(selectionArea), 100); + } + + function cancel(event) { + if (event.key === "Escape") { + removeOverlay(); + } + } + + overlay.addEventListener('mousedown', selection_mouseDown); + overlay.addEventListener('mousemove', selection_mouseMove); + overlay.addEventListener('mouseup', selection_mouseUp); + overlay.addEventListener('mouseup', selection_mouseUp); + messageComp.addEventListener('keydown', cancel); + }); +} + +function makeLinksAbsolute(container) { + for (const link of container.getElementsByTagName('a')) { + if (link.href) { + link.href = absoluteUrl(link.href); + } + } +} + +function getImages(container) { + const images = []; + + for (const img of container.getElementsByTagName('img')) { + if (!img.src) { + continue; + } + + const existingImage = images.find(image => image.src === img.src); + + if (existingImage) { + img.src = existingImage.imageId; + } + else { + const imageId = randomString(20); + + images.push({ + imageId: imageId, + src: img.src + }); + + img.src = imageId; + } + } + + return images; +} + +function createLink(clickAction, text, color = "lightskyblue") { + const link = document.createElement('a'); + link.href = "javascript:"; + link.style.color = color; + link.appendChild(document.createTextNode(text)); + link.addEventListener("click", () => { + browser.runtime.sendMessage(null, clickAction) + }); + + return link +} + +async function prepareMessageResponse(message) { + console.info('Message: ' + message.name); + + if (message.name === "toast") { + let messageText; + + if (message.noteId) { + messageText = document.createElement('p'); + messageText.setAttribute("style", "padding: 0; margin: 0; font-size: larger;") + messageText.appendChild(document.createTextNode(message.message + " ")); + messageText.appendChild(createLink( + {name: 'openNoteInTrilium', noteId: message.noteId}, + "Open in Trilium." + )); + + // only after saving tabs + if (message.tabIds) { + messageText.appendChild(document.createElement("br")); + messageText.appendChild(createLink( + {name: 'closeTabs', tabIds: message.tabIds}, + "Close saved tabs.", + "tomato" + )); + } + } + else { + messageText = message.message; + } + + await requireLib('/lib/toast.js'); + + showToast(messageText, { + settings: { + duration: 7000 + } + }); + } + else if (message.name === "trilium-save-selection") { + const container = document.createElement('div'); + + const selection = window.getSelection(); + + for (let i = 0; i < selection.rangeCount; i++) { + const range = selection.getRangeAt(i); + + container.appendChild(range.cloneContents()); + } + + makeLinksAbsolute(container); + + const images = getImages(container); + + return { + title: pageTitle(), + content: container.innerHTML, + images: images, + pageUrl: getPageLocationOrigin() + location.pathname + location.search + location.hash + }; + + } + else if (message.name === 'trilium-get-rectangle-for-screenshot') { + return getRectangleArea(); + } + else if (message.name === "trilium-save-page") { + await requireLib("/lib/JSDOMParser.js"); + await requireLib("/lib/Readability.js"); + await requireLib("/lib/Readability-readerable.js"); + + const {title, body} = getReadableDocument(); + + makeLinksAbsolute(body); + + const images = getImages(body); + + var labels = {}; + const dates = getDocumentDates(); + if (dates.publishedDate) { + labels['publishedDate'] = dates.publishedDate.toISOString().substring(0, 10); + } + if (dates.modifiedDate) { + labels['modifiedDate'] = dates.publishedDate.toISOString().substring(0, 10); + } + + return { + title: title, + content: body.innerHTML, + images: images, + pageUrl: getPageLocationOrigin() + location.pathname + location.search, + clipType: 'page', + labels: labels + }; + } + else { + throw new Error('Unknown command: ' + JSON.stringify(message)); + } +} + +browser.runtime.onMessage.addListener(prepareMessageResponse); + +const loadedLibs = []; + +async function requireLib(libPath) { + if (!loadedLibs.includes(libPath)) { + loadedLibs.push(libPath); + + await browser.runtime.sendMessage({name: 'load-script', file: libPath}); + } +} diff --git a/apps/web-clipper/icons/32-dev.png b/apps/web-clipper/icons/32-dev.png new file mode 100644 index 000000000..d280a31bb Binary files /dev/null and b/apps/web-clipper/icons/32-dev.png differ diff --git a/apps/web-clipper/icons/32.png b/apps/web-clipper/icons/32.png new file mode 100644 index 000000000..9aeeb66fe Binary files /dev/null and b/apps/web-clipper/icons/32.png differ diff --git a/apps/web-clipper/icons/48.png b/apps/web-clipper/icons/48.png new file mode 100644 index 000000000..da66c56f6 Binary files /dev/null and b/apps/web-clipper/icons/48.png differ diff --git a/apps/web-clipper/icons/96.png b/apps/web-clipper/icons/96.png new file mode 100644 index 000000000..f4783da58 Binary files /dev/null and b/apps/web-clipper/icons/96.png differ diff --git a/apps/web-clipper/lib/JSDOMParser.js b/apps/web-clipper/lib/JSDOMParser.js new file mode 100644 index 000000000..7bfa2acf5 --- /dev/null +++ b/apps/web-clipper/lib/JSDOMParser.js @@ -0,0 +1,1196 @@ +/*eslint-env es6:false*/ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * This is a relatively lightweight DOMParser that is safe to use in a web + * worker. This is far from a complete DOM implementation; however, it should + * contain the minimal set of functionality necessary for Readability.js. + * + * Aside from not implementing the full DOM API, there are other quirks to be + * aware of when using the JSDOMParser: + * + * 1) Properly formed HTML/XML must be used. This means you should be extra + * careful when using this parser on anything received directly from an + * XMLHttpRequest. Providing a serialized string from an XMLSerializer, + * however, should be safe (since the browser's XMLSerializer should + * generate valid HTML/XML). Therefore, if parsing a document from an XHR, + * the recommended approach is to do the XHR in the main thread, use + * XMLSerializer.serializeToString() on the responseXML, and pass the + * resulting string to the worker. + * + * 2) Live NodeLists are not supported. DOM methods and properties such as + * getElementsByTagName() and childNodes return standard arrays. If you + * want these lists to be updated when nodes are removed or added to the + * document, you must take care to manually update them yourself. + */ +(function (global) { + + // XML only defines these and the numeric ones: + + var entityTable = { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }; + + var reverseEntityTable = { + "<": "<", + ">": ">", + "&": "&", + '"': """, + "'": "'", + }; + + function encodeTextContentHTML(s) { + return s.replace(/[&<>]/g, function(x) { + return reverseEntityTable[x]; + }); + } + + function encodeHTML(s) { + return s.replace(/[&<>'"]/g, function(x) { + return reverseEntityTable[x]; + }); + } + + function decodeHTML(str) { + return str.replace(/&(quot|amp|apos|lt|gt);/g, function(match, tag) { + return entityTable[tag]; + }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(match, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 16 : 10); // read num + return String.fromCharCode(num); + }); + } + + // When a style is set in JS, map it to the corresponding CSS attribute + var styleMap = { + "alignmentBaseline": "alignment-baseline", + "background": "background", + "backgroundAttachment": "background-attachment", + "backgroundClip": "background-clip", + "backgroundColor": "background-color", + "backgroundImage": "background-image", + "backgroundOrigin": "background-origin", + "backgroundPosition": "background-position", + "backgroundPositionX": "background-position-x", + "backgroundPositionY": "background-position-y", + "backgroundRepeat": "background-repeat", + "backgroundRepeatX": "background-repeat-x", + "backgroundRepeatY": "background-repeat-y", + "backgroundSize": "background-size", + "baselineShift": "baseline-shift", + "border": "border", + "borderBottom": "border-bottom", + "borderBottomColor": "border-bottom-color", + "borderBottomLeftRadius": "border-bottom-left-radius", + "borderBottomRightRadius": "border-bottom-right-radius", + "borderBottomStyle": "border-bottom-style", + "borderBottomWidth": "border-bottom-width", + "borderCollapse": "border-collapse", + "borderColor": "border-color", + "borderImage": "border-image", + "borderImageOutset": "border-image-outset", + "borderImageRepeat": "border-image-repeat", + "borderImageSlice": "border-image-slice", + "borderImageSource": "border-image-source", + "borderImageWidth": "border-image-width", + "borderLeft": "border-left", + "borderLeftColor": "border-left-color", + "borderLeftStyle": "border-left-style", + "borderLeftWidth": "border-left-width", + "borderRadius": "border-radius", + "borderRight": "border-right", + "borderRightColor": "border-right-color", + "borderRightStyle": "border-right-style", + "borderRightWidth": "border-right-width", + "borderSpacing": "border-spacing", + "borderStyle": "border-style", + "borderTop": "border-top", + "borderTopColor": "border-top-color", + "borderTopLeftRadius": "border-top-left-radius", + "borderTopRightRadius": "border-top-right-radius", + "borderTopStyle": "border-top-style", + "borderTopWidth": "border-top-width", + "borderWidth": "border-width", + "bottom": "bottom", + "boxShadow": "box-shadow", + "boxSizing": "box-sizing", + "captionSide": "caption-side", + "clear": "clear", + "clip": "clip", + "clipPath": "clip-path", + "clipRule": "clip-rule", + "color": "color", + "colorInterpolation": "color-interpolation", + "colorInterpolationFilters": "color-interpolation-filters", + "colorProfile": "color-profile", + "colorRendering": "color-rendering", + "content": "content", + "counterIncrement": "counter-increment", + "counterReset": "counter-reset", + "cursor": "cursor", + "direction": "direction", + "display": "display", + "dominantBaseline": "dominant-baseline", + "emptyCells": "empty-cells", + "enableBackground": "enable-background", + "fill": "fill", + "fillOpacity": "fill-opacity", + "fillRule": "fill-rule", + "filter": "filter", + "cssFloat": "float", + "floodColor": "flood-color", + "floodOpacity": "flood-opacity", + "font": "font", + "fontFamily": "font-family", + "fontSize": "font-size", + "fontStretch": "font-stretch", + "fontStyle": "font-style", + "fontVariant": "font-variant", + "fontWeight": "font-weight", + "glyphOrientationHorizontal": "glyph-orientation-horizontal", + "glyphOrientationVertical": "glyph-orientation-vertical", + "height": "height", + "imageRendering": "image-rendering", + "kerning": "kerning", + "left": "left", + "letterSpacing": "letter-spacing", + "lightingColor": "lighting-color", + "lineHeight": "line-height", + "listStyle": "list-style", + "listStyleImage": "list-style-image", + "listStylePosition": "list-style-position", + "listStyleType": "list-style-type", + "margin": "margin", + "marginBottom": "margin-bottom", + "marginLeft": "margin-left", + "marginRight": "margin-right", + "marginTop": "margin-top", + "marker": "marker", + "markerEnd": "marker-end", + "markerMid": "marker-mid", + "markerStart": "marker-start", + "mask": "mask", + "maxHeight": "max-height", + "maxWidth": "max-width", + "minHeight": "min-height", + "minWidth": "min-width", + "opacity": "opacity", + "orphans": "orphans", + "outline": "outline", + "outlineColor": "outline-color", + "outlineOffset": "outline-offset", + "outlineStyle": "outline-style", + "outlineWidth": "outline-width", + "overflow": "overflow", + "overflowX": "overflow-x", + "overflowY": "overflow-y", + "padding": "padding", + "paddingBottom": "padding-bottom", + "paddingLeft": "padding-left", + "paddingRight": "padding-right", + "paddingTop": "padding-top", + "page": "page", + "pageBreakAfter": "page-break-after", + "pageBreakBefore": "page-break-before", + "pageBreakInside": "page-break-inside", + "pointerEvents": "pointer-events", + "position": "position", + "quotes": "quotes", + "resize": "resize", + "right": "right", + "shapeRendering": "shape-rendering", + "size": "size", + "speak": "speak", + "src": "src", + "stopColor": "stop-color", + "stopOpacity": "stop-opacity", + "stroke": "stroke", + "strokeDasharray": "stroke-dasharray", + "strokeDashoffset": "stroke-dashoffset", + "strokeLinecap": "stroke-linecap", + "strokeLinejoin": "stroke-linejoin", + "strokeMiterlimit": "stroke-miterlimit", + "strokeOpacity": "stroke-opacity", + "strokeWidth": "stroke-width", + "tableLayout": "table-layout", + "textAlign": "text-align", + "textAnchor": "text-anchor", + "textDecoration": "text-decoration", + "textIndent": "text-indent", + "textLineThrough": "text-line-through", + "textLineThroughColor": "text-line-through-color", + "textLineThroughMode": "text-line-through-mode", + "textLineThroughStyle": "text-line-through-style", + "textLineThroughWidth": "text-line-through-width", + "textOverflow": "text-overflow", + "textOverline": "text-overline", + "textOverlineColor": "text-overline-color", + "textOverlineMode": "text-overline-mode", + "textOverlineStyle": "text-overline-style", + "textOverlineWidth": "text-overline-width", + "textRendering": "text-rendering", + "textShadow": "text-shadow", + "textTransform": "text-transform", + "textUnderline": "text-underline", + "textUnderlineColor": "text-underline-color", + "textUnderlineMode": "text-underline-mode", + "textUnderlineStyle": "text-underline-style", + "textUnderlineWidth": "text-underline-width", + "top": "top", + "unicodeBidi": "unicode-bidi", + "unicodeRange": "unicode-range", + "vectorEffect": "vector-effect", + "verticalAlign": "vertical-align", + "visibility": "visibility", + "whiteSpace": "white-space", + "widows": "widows", + "width": "width", + "wordBreak": "word-break", + "wordSpacing": "word-spacing", + "wordWrap": "word-wrap", + "writingMode": "writing-mode", + "zIndex": "z-index", + "zoom": "zoom", + }; + + // Elements that can be self-closing + var voidElems = { + "area": true, + "base": true, + "br": true, + "col": true, + "command": true, + "embed": true, + "hr": true, + "img": true, + "input": true, + "link": true, + "meta": true, + "param": true, + "source": true, + "wbr": true + }; + + var whitespace = [" ", "\t", "\n", "\r"]; + + // See https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType + var nodeTypes = { + ELEMENT_NODE: 1, + ATTRIBUTE_NODE: 2, + TEXT_NODE: 3, + CDATA_SECTION_NODE: 4, + ENTITY_REFERENCE_NODE: 5, + ENTITY_NODE: 6, + PROCESSING_INSTRUCTION_NODE: 7, + COMMENT_NODE: 8, + DOCUMENT_NODE: 9, + DOCUMENT_TYPE_NODE: 10, + DOCUMENT_FRAGMENT_NODE: 11, + NOTATION_NODE: 12 + }; + + function getElementsByTagName(tag) { + tag = tag.toUpperCase(); + var elems = []; + var allTags = (tag === "*"); + function getElems(node) { + var length = node.children.length; + for (var i = 0; i < length; i++) { + var child = node.children[i]; + if (allTags || (child.tagName === tag)) + elems.push(child); + getElems(child); + } + } + getElems(this); + elems._isLiveNodeList = true; + return elems; + } + + var Node = function () {}; + + Node.prototype = { + attributes: null, + childNodes: null, + localName: null, + nodeName: null, + parentNode: null, + textContent: null, + nextSibling: null, + previousSibling: null, + + get firstChild() { + return this.childNodes[0] || null; + }, + + get firstElementChild() { + return this.children[0] || null; + }, + + get lastChild() { + return this.childNodes[this.childNodes.length - 1] || null; + }, + + get lastElementChild() { + return this.children[this.children.length - 1] || null; + }, + + appendChild: function (child) { + if (child.parentNode) { + child.parentNode.removeChild(child); + } + + var last = this.lastChild; + if (last) + last.nextSibling = child; + child.previousSibling = last; + + if (child.nodeType === Node.ELEMENT_NODE) { + child.previousElementSibling = this.children[this.children.length - 1] || null; + this.children.push(child); + child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child); + } + this.childNodes.push(child); + child.parentNode = this; + }, + + removeChild: function (child) { + var childNodes = this.childNodes; + var childIndex = childNodes.indexOf(child); + if (childIndex === -1) { + throw "removeChild: node not found"; + } else { + child.parentNode = null; + var prev = child.previousSibling; + var next = child.nextSibling; + if (prev) + prev.nextSibling = next; + if (next) + next.previousSibling = prev; + + if (child.nodeType === Node.ELEMENT_NODE) { + prev = child.previousElementSibling; + next = child.nextElementSibling; + if (prev) + prev.nextElementSibling = next; + if (next) + next.previousElementSibling = prev; + this.children.splice(this.children.indexOf(child), 1); + } + + child.previousSibling = child.nextSibling = null; + child.previousElementSibling = child.nextElementSibling = null; + + return childNodes.splice(childIndex, 1)[0]; + } + }, + + replaceChild: function (newNode, oldNode) { + var childNodes = this.childNodes; + var childIndex = childNodes.indexOf(oldNode); + if (childIndex === -1) { + throw "replaceChild: node not found"; + } else { + // This will take care of updating the new node if it was somewhere else before: + if (newNode.parentNode) + newNode.parentNode.removeChild(newNode); + + childNodes[childIndex] = newNode; + + // update the new node's sibling properties, and its new siblings' sibling properties + newNode.nextSibling = oldNode.nextSibling; + newNode.previousSibling = oldNode.previousSibling; + if (newNode.nextSibling) + newNode.nextSibling.previousSibling = newNode; + if (newNode.previousSibling) + newNode.previousSibling.nextSibling = newNode; + + newNode.parentNode = this; + + // Now deal with elements before we clear out those values for the old node, + // because it can help us take shortcuts here: + if (newNode.nodeType === Node.ELEMENT_NODE) { + if (oldNode.nodeType === Node.ELEMENT_NODE) { + // Both were elements, which makes this easier, we just swap things out: + newNode.previousElementSibling = oldNode.previousElementSibling; + newNode.nextElementSibling = oldNode.nextElementSibling; + if (newNode.previousElementSibling) + newNode.previousElementSibling.nextElementSibling = newNode; + if (newNode.nextElementSibling) + newNode.nextElementSibling.previousElementSibling = newNode; + this.children[this.children.indexOf(oldNode)] = newNode; + } else { + // Hard way: + newNode.previousElementSibling = (function() { + for (var i = childIndex - 1; i >= 0; i--) { + if (childNodes[i].nodeType === Node.ELEMENT_NODE) + return childNodes[i]; + } + return null; + })(); + if (newNode.previousElementSibling) { + newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling; + } else { + newNode.nextElementSibling = (function() { + for (var i = childIndex + 1; i < childNodes.length; i++) { + if (childNodes[i].nodeType === Node.ELEMENT_NODE) + return childNodes[i]; + } + return null; + })(); + } + if (newNode.previousElementSibling) + newNode.previousElementSibling.nextElementSibling = newNode; + if (newNode.nextElementSibling) + newNode.nextElementSibling.previousElementSibling = newNode; + + if (newNode.nextElementSibling) + this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode); + else + this.children.push(newNode); + } + } else if (oldNode.nodeType === Node.ELEMENT_NODE) { + // new node is not an element node. + // if the old one was, update its element siblings: + if (oldNode.previousElementSibling) + oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling; + if (oldNode.nextElementSibling) + oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling; + this.children.splice(this.children.indexOf(oldNode), 1); + + // If the old node wasn't an element, neither the new nor the old node was an element, + // and the children array and its members shouldn't need any updating. + } + + + oldNode.parentNode = null; + oldNode.previousSibling = null; + oldNode.nextSibling = null; + if (oldNode.nodeType === Node.ELEMENT_NODE) { + oldNode.previousElementSibling = null; + oldNode.nextElementSibling = null; + } + return oldNode; + } + }, + + __JSDOMParser__: true, + }; + + for (var nodeType in nodeTypes) { + Node[nodeType] = Node.prototype[nodeType] = nodeTypes[nodeType]; + } + + var Attribute = function (name, value) { + this.name = name; + this._value = value; + }; + + Attribute.prototype = { + get value() { + return this._value; + }, + setValue: function(newValue) { + this._value = newValue; + }, + getEncodedValue: function() { + return encodeHTML(this._value); + }, + }; + + var Comment = function () { + this.childNodes = []; + }; + + Comment.prototype = { + __proto__: Node.prototype, + + nodeName: "#comment", + nodeType: Node.COMMENT_NODE + }; + + var Text = function () { + this.childNodes = []; + }; + + Text.prototype = { + __proto__: Node.prototype, + + nodeName: "#text", + nodeType: Node.TEXT_NODE, + get textContent() { + if (typeof this._textContent === "undefined") { + this._textContent = decodeHTML(this._innerHTML || ""); + } + return this._textContent; + }, + get innerHTML() { + if (typeof this._innerHTML === "undefined") { + this._innerHTML = encodeTextContentHTML(this._textContent || ""); + } + return this._innerHTML; + }, + + set innerHTML(newHTML) { + this._innerHTML = newHTML; + delete this._textContent; + }, + set textContent(newText) { + this._textContent = newText; + delete this._innerHTML; + }, + }; + + var Document = function (url) { + this.documentURI = url; + this.styleSheets = []; + this.childNodes = []; + this.children = []; + }; + + Document.prototype = { + __proto__: Node.prototype, + + nodeName: "#document", + nodeType: Node.DOCUMENT_NODE, + title: "", + + getElementsByTagName: getElementsByTagName, + + getElementById: function (id) { + function getElem(node) { + var length = node.children.length; + if (node.id === id) + return node; + for (var i = 0; i < length; i++) { + var el = getElem(node.children[i]); + if (el) + return el; + } + return null; + } + return getElem(this); + }, + + createElement: function (tag) { + var node = new Element(tag); + return node; + }, + + createTextNode: function (text) { + var node = new Text(); + node.textContent = text; + return node; + }, + + get baseURI() { + if (!this.hasOwnProperty("_baseURI")) { + this._baseURI = this.documentURI; + var baseElements = this.getElementsByTagName("base"); + var href = baseElements[0] && baseElements[0].getAttribute("href"); + if (href) { + try { + this._baseURI = (new URL(href, this._baseURI)).href; + } catch (ex) {/* Just fall back to documentURI */} + } + } + return this._baseURI; + }, + }; + + var Element = function (tag) { + // We use this to find the closing tag. + this._matchingTag = tag; + // We're explicitly a non-namespace aware parser, we just pretend it's all HTML. + var lastColonIndex = tag.lastIndexOf(":"); + if (lastColonIndex != -1) { + tag = tag.substring(lastColonIndex + 1); + } + this.attributes = []; + this.childNodes = []; + this.children = []; + this.nextElementSibling = this.previousElementSibling = null; + this.localName = tag.toLowerCase(); + this.tagName = tag.toUpperCase(); + this.style = new Style(this); + }; + + Element.prototype = { + __proto__: Node.prototype, + + nodeType: Node.ELEMENT_NODE, + + getElementsByTagName: getElementsByTagName, + + get className() { + return this.getAttribute("class") || ""; + }, + + set className(str) { + this.setAttribute("class", str); + }, + + get id() { + return this.getAttribute("id") || ""; + }, + + set id(str) { + this.setAttribute("id", str); + }, + + get href() { + return this.getAttribute("href") || ""; + }, + + set href(str) { + this.setAttribute("href", str); + }, + + get src() { + return this.getAttribute("src") || ""; + }, + + set src(str) { + this.setAttribute("src", str); + }, + + get srcset() { + return this.getAttribute("srcset") || ""; + }, + + set srcset(str) { + this.setAttribute("srcset", str); + }, + + get nodeName() { + return this.tagName; + }, + + get innerHTML() { + function getHTML(node) { + var i = 0; + for (i = 0; i < node.childNodes.length; i++) { + var child = node.childNodes[i]; + if (child.localName) { + arr.push("<" + child.localName); + + // serialize attribute list + for (var j = 0; j < child.attributes.length; j++) { + var attr = child.attributes[j]; + // the attribute value will be HTML escaped. + var val = attr.getEncodedValue(); + var quote = (val.indexOf('"') === -1 ? '"' : "'"); + arr.push(" " + attr.name + "=" + quote + val + quote); + } + + if (child.localName in voidElems && !child.childNodes.length) { + // if this is a self-closing element, end it here + arr.push("/>"); + } else { + // otherwise, add its children + arr.push(">"); + getHTML(child); + arr.push(""); + } + } else { + // This is a text node, so asking for innerHTML won't recurse. + arr.push(child.innerHTML); + } + } + } + + // Using Array.join() avoids the overhead from lazy string concatenation. + var arr = []; + getHTML(this); + return arr.join(""); + }, + + set innerHTML(html) { + var parser = new JSDOMParser(); + var node = parser.parse(html); + var i; + for (i = this.childNodes.length; --i >= 0;) { + this.childNodes[i].parentNode = null; + } + this.childNodes = node.childNodes; + this.children = node.children; + for (i = this.childNodes.length; --i >= 0;) { + this.childNodes[i].parentNode = this; + } + }, + + set textContent(text) { + // clear parentNodes for existing children + for (var i = this.childNodes.length; --i >= 0;) { + this.childNodes[i].parentNode = null; + } + + var node = new Text(); + this.childNodes = [ node ]; + this.children = []; + node.textContent = text; + node.parentNode = this; + }, + + get textContent() { + function getText(node) { + var nodes = node.childNodes; + for (var i = 0; i < nodes.length; i++) { + var child = nodes[i]; + if (child.nodeType === 3) { + text.push(child.textContent); + } else { + getText(child); + } + } + } + + // Using Array.join() avoids the overhead from lazy string concatenation. + // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes + var text = []; + getText(this); + return text.join(""); + }, + + getAttribute: function (name) { + for (var i = this.attributes.length; --i >= 0;) { + var attr = this.attributes[i]; + if (attr.name === name) { + return attr.value; + } + } + return undefined; + }, + + setAttribute: function (name, value) { + for (var i = this.attributes.length; --i >= 0;) { + var attr = this.attributes[i]; + if (attr.name === name) { + attr.setValue(value); + return; + } + } + this.attributes.push(new Attribute(name, value)); + }, + + removeAttribute: function (name) { + for (var i = this.attributes.length; --i >= 0;) { + var attr = this.attributes[i]; + if (attr.name === name) { + this.attributes.splice(i, 1); + break; + } + } + }, + + hasAttribute: function (name) { + return this.attributes.some(function (attr) { + return attr.name == name; + }); + }, + }; + + var Style = function (node) { + this.node = node; + }; + + // getStyle() and setStyle() use the style attribute string directly. This + // won't be very efficient if there are a lot of style manipulations, but + // it's the easiest way to make sure the style attribute string and the JS + // style property stay in sync. Readability.js doesn't do many style + // manipulations, so this should be okay. + Style.prototype = { + getStyle: function (styleName) { + var attr = this.node.getAttribute("style"); + if (!attr) + return undefined; + + var styles = attr.split(";"); + for (var i = 0; i < styles.length; i++) { + var style = styles[i].split(":"); + var name = style[0].trim(); + if (name === styleName) + return style[1].trim(); + } + + return undefined; + }, + + setStyle: function (styleName, styleValue) { + var value = this.node.getAttribute("style") || ""; + var index = 0; + do { + var next = value.indexOf(";", index) + 1; + var length = next - index - 1; + var style = (length > 0 ? value.substr(index, length) : value.substr(index)); + if (style.substr(0, style.indexOf(":")).trim() === styleName) { + value = value.substr(0, index).trim() + (next ? " " + value.substr(next).trim() : ""); + break; + } + index = next; + } while (index); + + value += " " + styleName + ": " + styleValue + ";"; + this.node.setAttribute("style", value.trim()); + } + }; + + // For each item in styleMap, define a getter and setter on the style + // property. + for (var jsName in styleMap) { + (function (cssName) { + Style.prototype.__defineGetter__(jsName, function () { + return this.getStyle(cssName); + }); + Style.prototype.__defineSetter__(jsName, function (value) { + this.setStyle(cssName, value); + }); + })(styleMap[jsName]); + } + + var JSDOMParser = function () { + this.currentChar = 0; + + // In makeElementNode() we build up many strings one char at a time. Using + // += for this results in lots of short-lived intermediate strings. It's + // better to build an array of single-char strings and then join() them + // together at the end. And reusing a single array (i.e. |this.strBuf|) + // over and over for this purpose uses less memory than using a new array + // for each string. + this.strBuf = []; + + // Similarly, we reuse this array to return the two arguments from + // makeElementNode(), which saves us from having to allocate a new array + // every time. + this.retPair = []; + + this.errorState = ""; + }; + + JSDOMParser.prototype = { + error: function(m) { + if (typeof dump !== "undefined") { + dump("JSDOMParser error: " + m + "\n"); + } else if (typeof console !== "undefined") { + console.log("JSDOMParser error: " + m + "\n"); + } + this.errorState += m + "\n"; + }, + + /** + * Look at the next character without advancing the index. + */ + peekNext: function () { + return this.html[this.currentChar]; + }, + + /** + * Get the next character and advance the index. + */ + nextChar: function () { + return this.html[this.currentChar++]; + }, + + /** + * Called after a quote character is read. This finds the next quote + * character and returns the text string in between. + */ + readString: function (quote) { + var str; + var n = this.html.indexOf(quote, this.currentChar); + if (n === -1) { + this.currentChar = this.html.length; + str = null; + } else { + str = this.html.substring(this.currentChar, n); + this.currentChar = n + 1; + } + + return str; + }, + + /** + * Called when parsing a node. This finds the next name/value attribute + * pair and adds the result to the attributes list. + */ + readAttribute: function (node) { + var name = ""; + + var n = this.html.indexOf("=", this.currentChar); + if (n === -1) { + this.currentChar = this.html.length; + } else { + // Read until a '=' character is hit; this will be the attribute key + name = this.html.substring(this.currentChar, n); + this.currentChar = n + 1; + } + + if (!name) + return; + + // After a '=', we should see a '"' for the attribute value + var c = this.nextChar(); + if (c !== '"' && c !== "'") { + this.error("Error reading attribute " + name + ", expecting '\"'"); + return; + } + + // Read the attribute value (and consume the matching quote) + var value = this.readString(c); + + node.attributes.push(new Attribute(name, decodeHTML(value))); + + return; + }, + + /** + * Parses and returns an Element node. This is called after a '<' has been + * read. + * + * @returns an array; the first index of the array is the parsed node; + * the second index is a boolean indicating whether this is a void + * Element + */ + makeElementNode: function (retPair) { + var c = this.nextChar(); + + // Read the Element tag name + var strBuf = this.strBuf; + strBuf.length = 0; + while (whitespace.indexOf(c) == -1 && c !== ">" && c !== "/") { + if (c === undefined) + return false; + strBuf.push(c); + c = this.nextChar(); + } + var tag = strBuf.join(""); + + if (!tag) + return false; + + var node = new Element(tag); + + // Read Element attributes + while (c !== "/" && c !== ">") { + if (c === undefined) + return false; + while (whitespace.indexOf(this.html[this.currentChar++]) != -1) { + // Advance cursor to first non-whitespace char. + } + this.currentChar--; + c = this.nextChar(); + if (c !== "/" && c !== ">") { + --this.currentChar; + this.readAttribute(node); + } + } + + // If this is a self-closing tag, read '/>' + var closed = false; + if (c === "/") { + closed = true; + c = this.nextChar(); + if (c !== ">") { + this.error("expected '>' to close " + tag); + return false; + } + } + + retPair[0] = node; + retPair[1] = closed; + return true; + }, + + /** + * If the current input matches this string, advance the input index; + * otherwise, do nothing. + * + * @returns whether input matched string + */ + match: function (str) { + var strlen = str.length; + if (this.html.substr(this.currentChar, strlen).toLowerCase() === str.toLowerCase()) { + this.currentChar += strlen; + return true; + } + return false; + }, + + /** + * Searches the input until a string is found and discards all input up to + * and including the matched string. + */ + discardTo: function (str) { + var index = this.html.indexOf(str, this.currentChar) + str.length; + if (index === -1) + this.currentChar = this.html.length; + this.currentChar = index; + }, + + /** + * Reads child nodes for the given node. + */ + readChildren: function (node) { + var child; + while ((child = this.readNode())) { + // Don't keep Comment nodes + if (child.nodeType !== 8) { + node.appendChild(child); + } + } + }, + + discardNextComment: function() { + if (this.match("--")) { + this.discardTo("-->"); + } else { + var c = this.nextChar(); + while (c !== ">") { + if (c === undefined) + return null; + if (c === '"' || c === "'") + this.readString(c); + c = this.nextChar(); + } + } + return new Comment(); + }, + + + /** + * Reads the next child node from the input. If we're reading a closing + * tag, or if we've reached the end of input, return null. + * + * @returns the node + */ + readNode: function () { + var c = this.nextChar(); + + if (c === undefined) + return null; + + // Read any text as Text node + var textNode; + if (c !== "<") { + --this.currentChar; + textNode = new Text(); + var n = this.html.indexOf("<", this.currentChar); + if (n === -1) { + textNode.innerHTML = this.html.substring(this.currentChar, this.html.length); + this.currentChar = this.html.length; + } else { + textNode.innerHTML = this.html.substring(this.currentChar, n); + this.currentChar = n; + } + return textNode; + } + + if (this.match("![CDATA[")) { + var endChar = this.html.indexOf("]]>", this.currentChar); + if (endChar === -1) { + this.error("unclosed CDATA section"); + return null; + } + textNode = new Text(); + textNode.textContent = this.html.substring(this.currentChar, endChar); + this.currentChar = endChar + ("]]>").length; + return textNode; + } + + c = this.peekNext(); + + // Read Comment node. Normally, Comment nodes know their inner + // textContent, but we don't really care about Comment nodes (we throw + // them away in readChildren()). So just returning an empty Comment node + // here is sufficient. + if (c === "!" || c === "?") { + // We're still before the ! or ? that is starting this comment: + this.currentChar++; + return this.discardNextComment(); + } + + // If we're reading a closing tag, return null. This means we've reached + // the end of this set of child nodes. + if (c === "/") { + --this.currentChar; + return null; + } + + // Otherwise, we're looking at an Element node + var result = this.makeElementNode(this.retPair); + if (!result) + return null; + + var node = this.retPair[0]; + var closed = this.retPair[1]; + var localName = node.localName; + + // If this isn't a void Element, read its child nodes + if (!closed) { + this.readChildren(node); + var closingTag = ""; + if (!this.match(closingTag)) { + this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length)); + return null; + } + } + + // Only use the first title, because SVG might have other + // title elements which we don't care about (medium.com + // does this, at least). + if (localName === "title" && !this.doc.title) { + this.doc.title = node.textContent.trim(); + } else if (localName === "head") { + this.doc.head = node; + } else if (localName === "body") { + this.doc.body = node; + } else if (localName === "html") { + this.doc.documentElement = node; + } + + return node; + }, + + /** + * Parses an HTML string and returns a JS implementation of the Document. + */ + parse: function (html, url) { + this.html = html; + var doc = this.doc = new Document(url); + this.readChildren(doc); + + // If this is an HTML document, remove root-level children except for the + // node + if (doc.documentElement) { + for (var i = doc.childNodes.length; --i >= 0;) { + var child = doc.childNodes[i]; + if (child !== doc.documentElement) { + doc.removeChild(child); + } + } + } + + return doc; + } + }; + + // Attach the standard DOM types to the global scope + global.Node = Node; + global.Comment = Comment; + global.Document = Document; + global.Element = Element; + global.Text = Text; + + // Attach JSDOMParser to the global scope + global.JSDOMParser = JSDOMParser; + +})(this); + +if (typeof module === "object") { + module.exports = this.JSDOMParser; +} diff --git a/apps/web-clipper/lib/Readability-readerable.js b/apps/web-clipper/lib/Readability-readerable.js new file mode 100644 index 000000000..64be5e15e --- /dev/null +++ b/apps/web-clipper/lib/Readability-readerable.js @@ -0,0 +1,108 @@ +/* eslint-env es6:false */ +/* + * Copyright (c) 2010 Arc90 Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This code is heavily based on Arc90's readability.js (1.7.1) script + * available at: http://code.google.com/p/arc90labs-readability + */ + +var REGEXPS = { + // NOTE: These two regular expressions are duplicated in + // Readability.js. Please keep both copies in sync. + unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, +}; + +function isNodeVisible(node) { + // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes. + return (!node.style || node.style.display != "none") + && !node.hasAttribute("hidden") + //check for "fallback-image" so that wikimedia math images are displayed + && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); +} + +/** + * Decides whether or not the document is reader-able without parsing the whole thing. + * @param {Object} options Configuration object. + * @param {number} [options.minContentLength=140] The minimum node content length used to decide if the document is readerable. + * @param {number} [options.minScore=20] The minumum cumulated 'score' used to determine if the document is readerable. + * @param {Function} [options.visibilityChecker=isNodeVisible] The function used to determine if a node is visible. + * @return {boolean} Whether or not we suspect Readability.parse() will suceeed at returning an article object. + */ +function isProbablyReaderable(doc, options = {}) { + // For backward compatibility reasons 'options' can either be a configuration object or the function used + // to determine if a node is visible. + if (typeof options == "function") { + options = { visibilityChecker: options }; + } + + var defaultOptions = { minScore: 20, minContentLength: 140, visibilityChecker: isNodeVisible }; + options = Object.assign(defaultOptions, options); + + var nodes = doc.querySelectorAll("p, pre, article"); + + // Get
nodes which have
node(s) and append them into the `nodes` variable. + // Some articles' DOM structures might look like + //
+ // Sentences
+ //
+ // Sentences
+ //
+ var brNodes = doc.querySelectorAll("div > br"); + if (brNodes.length) { + var set = new Set(nodes); + [].forEach.call(brNodes, function (node) { + set.add(node.parentNode); + }); + nodes = Array.from(set); + } + + var score = 0; + // This is a little cheeky, we use the accumulator 'score' to decide what to return from + // this callback: + return [].some.call(nodes, function (node) { + if (!options.visibilityChecker(node)) { + return false; + } + + var matchString = node.className + " " + node.id; + if (REGEXPS.unlikelyCandidates.test(matchString) && + !REGEXPS.okMaybeItsACandidate.test(matchString)) { + return false; + } + + if (node.matches("li p")) { + return false; + } + + var textContentLength = node.textContent.trim().length; + if (textContentLength < options.minContentLength) { + return false; + } + + score += Math.sqrt(textContentLength - options.minContentLength); + + if (score > options.minScore) { + return true; + } + return false; + }); +} + +if (typeof module === "object") { + module.exports = isProbablyReaderable; +} diff --git a/apps/web-clipper/lib/Readability.js b/apps/web-clipper/lib/Readability.js new file mode 100644 index 000000000..ce06df459 --- /dev/null +++ b/apps/web-clipper/lib/Readability.js @@ -0,0 +1,2283 @@ +/*eslint-env es6:false*/ +/* + * Copyright (c) 2010 Arc90 Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This code is heavily based on Arc90's readability.js (1.7.1) script + * available at: http://code.google.com/p/arc90labs-readability + */ + +/** + * Public constructor. + * @param {HTMLDocument} doc The document to parse. + * @param {Object} options The options object. + */ +function Readability(doc, options) { + // In some older versions, people passed a URI as the first argument. Cope: + if (options && options.documentElement) { + doc = options; + options = arguments[2]; + } else if (!doc || !doc.documentElement) { + throw new Error("First argument to Readability constructor should be a document object."); + } + options = options || {}; + + this._doc = doc; + this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; + this._articleTitle = null; + this._articleByline = null; + this._articleDir = null; + this._articleSiteName = null; + this._attempts = []; + + // Configurable options + this._debug = !!options.debug; + this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; + this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; + this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; + this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); + this._keepClasses = !!options.keepClasses; + this._serializer = options.serializer || function(el) { + return el.innerHTML; + }; + this._disableJSONLD = !!options.disableJSONLD; + + // Start with all flags set + this._flags = this.FLAG_STRIP_UNLIKELYS | + this.FLAG_WEIGHT_CLASSES | + this.FLAG_CLEAN_CONDITIONALLY; + + + // Control whether log messages are sent to the console + if (this._debug) { + let logNode = function(node) { + if (node.nodeType == node.TEXT_NODE) { + return `${node.nodeName} ("${node.textContent}")`; + } + let attrPairs = Array.from(node.attributes || [], function(attr) { + return `${attr.name}="${attr.value}"`; + }).join(" "); + return `<${node.localName} ${attrPairs}>`; + }; + this.log = function () { + if (typeof dump !== "undefined") { + var msg = Array.prototype.map.call(arguments, function(x) { + return (x && x.nodeName) ? logNode(x) : x; + }).join(" "); + dump("Reader: (Readability) " + msg + "\n"); + } else if (typeof console !== "undefined") { + let args = Array.from(arguments, arg => { + if (arg && arg.nodeType == this.ELEMENT_NODE) { + return logNode(arg); + } + return arg; + }); + args.unshift("Reader: (Readability)"); + console.log.apply(console, args); + } + }; + } else { + this.log = function () {}; + } +} + +Readability.prototype = { + FLAG_STRIP_UNLIKELYS: 0x1, + FLAG_WEIGHT_CLASSES: 0x2, + FLAG_CLEAN_CONDITIONALLY: 0x4, + + // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType + ELEMENT_NODE: 1, + TEXT_NODE: 3, + + // Max number of nodes supported by this parser. Default: 0 (no limit) + DEFAULT_MAX_ELEMS_TO_PARSE: 0, + + // The number of top candidates to consider when analysing how + // tight the competition is among candidates. + DEFAULT_N_TOP_CANDIDATES: 5, + + // Element tags to score by default. + DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), + + // The default number of chars an article must have in order to return a result + DEFAULT_CHAR_THRESHOLD: 500, + + // All of the regular expressions in use within readability. + // Defined up here so we don't instantiate them repeatedly in loops. + REGEXPS: { + // NOTE: These two regular expressions are duplicated in + // Readability-readerable.js. Please keep both copies in sync. + unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, + + positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, + negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, + byline: /byline|author|dateline|writtenby|p-author/i, + replaceFonts: /<(\/?)font[^>]*>/gi, + normalize: /\s{2,}/g, + videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, + shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, + nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, + prevLink: /(prev|earl|old|new|<|«)/i, + tokenize: /\W+/g, + whitespace: /^\s*$/, + hasContent: /\S$/, + hashUrl: /^#.+/, + srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, + // See: https://schema.org/Article + jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/ + }, + + UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], + + DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]), + + ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], + + PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ], + + DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ], + + // The commented out elements qualify as phrasing content but tend to be + // removed by readability when put into paragraphs, so we ignore them here. + PHRASING_ELEMS: [ + // "CANVAS", "IFRAME", "SVG", "VIDEO", + "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", + "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", + "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", + "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", + "SUP", "TEXTAREA", "TIME", "VAR", "WBR" + ], + + // These are the classes that readability sets itself. + CLASSES_TO_PRESERVE: [ "page" ], + + // These are the list of HTML entities that need to be escaped. + HTML_ESCAPE_MAP: { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }, + + /** + * Run any post-process modifications to article content as necessary. + * + * @param Element + * @return void + **/ + _postProcessContent: function(articleContent) { + // Readability cannot open relative uris so we convert them to absolute uris. + this._fixRelativeUris(articleContent); + + this._simplifyNestedElements(articleContent); + + if (!this._keepClasses) { + // Remove classes. + this._cleanClasses(articleContent); + } + }, + + /** + * Iterates over a NodeList, calls `filterFn` for each node and removes node + * if function returned `true`. + * + * If function is not passed, removes all the nodes in node list. + * + * @param NodeList nodeList The nodes to operate on + * @param Function filterFn the function to use as a filter + * @return void + */ + _removeNodes: function(nodeList, filterFn) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _removeNodes"); + } + for (var i = nodeList.length - 1; i >= 0; i--) { + var node = nodeList[i]; + var parentNode = node.parentNode; + if (parentNode) { + if (!filterFn || filterFn.call(this, node, i, nodeList)) { + parentNode.removeChild(node); + } + } + } + }, + + /** + * Iterates over a NodeList, and calls _setNodeTag for each node. + * + * @param NodeList nodeList The nodes to operate on + * @param String newTagName the new tag name to use + * @return void + */ + _replaceNodeTags: function(nodeList, newTagName) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _replaceNodeTags"); + } + for (const node of nodeList) { + this._setNodeTag(node, newTagName); + } + }, + + /** + * Iterate over a NodeList, which doesn't natively fully implement the Array + * interface. + * + * For convenience, the current object context is applied to the provided + * iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return void + */ + _forEachNode: function(nodeList, fn) { + Array.prototype.forEach.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, and return the first node that passes + * the supplied test function + * + * For convenience, the current object context is applied to the provided + * test function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The test function. + * @return void + */ + _findNode: function(nodeList, fn) { + return Array.prototype.find.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, return true if any of the provided iterate + * function calls returns true, false otherwise. + * + * For convenience, the current object context is applied to the + * provided iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return Boolean + */ + _someNode: function(nodeList, fn) { + return Array.prototype.some.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, return true if all of the provided iterate + * function calls return true, false otherwise. + * + * For convenience, the current object context is applied to the + * provided iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return Boolean + */ + _everyNode: function(nodeList, fn) { + return Array.prototype.every.call(nodeList, fn, this); + }, + + /** + * Concat all nodelists passed as arguments. + * + * @return ...NodeList + * @return Array + */ + _concatNodeLists: function() { + var slice = Array.prototype.slice; + var args = slice.call(arguments); + var nodeLists = args.map(function(list) { + return slice.call(list); + }); + return Array.prototype.concat.apply([], nodeLists); + }, + + _getAllNodesWithTag: function(node, tagNames) { + if (node.querySelectorAll) { + return node.querySelectorAll(tagNames.join(",")); + } + return [].concat.apply([], tagNames.map(function(tag) { + var collection = node.getElementsByTagName(tag); + return Array.isArray(collection) ? collection : Array.from(collection); + })); + }, + + /** + * Removes the class="" attribute from every element in the given + * subtree, except those that match CLASSES_TO_PRESERVE and + * the classesToPreserve array from the options object. + * + * @param Element + * @return void + */ + _cleanClasses: function(node) { + var classesToPreserve = this._classesToPreserve; + var className = (node.getAttribute("class") || "") + .split(/\s+/) + .filter(function(cls) { + return classesToPreserve.indexOf(cls) != -1; + }) + .join(" "); + + if (className) { + node.setAttribute("class", className); + } else { + node.removeAttribute("class"); + } + + for (node = node.firstElementChild; node; node = node.nextElementSibling) { + this._cleanClasses(node); + } + }, + + /** + * Converts each and uri in the given element to an absolute URI, + * ignoring #ref URIs. + * + * @param Element + * @return void + */ + _fixRelativeUris: function(articleContent) { + var baseURI = this._doc.baseURI; + var documentURI = this._doc.documentURI; + function toAbsoluteURI(uri) { + // Leave hash links alone if the base URI matches the document URI: + if (baseURI == documentURI && uri.charAt(0) == "#") { + return uri; + } + + // Otherwise, resolve against base URI: + try { + return new URL(uri, baseURI).href; + } catch (ex) { + // Something went wrong, just return the original: + } + return uri; + } + + var links = this._getAllNodesWithTag(articleContent, ["a"]); + this._forEachNode(links, function(link) { + var href = link.getAttribute("href"); + if (href) { + // Remove links with javascript: URIs, since + // they won't work after scripts have been removed from the page. + if (href.indexOf("javascript:") === 0) { + // if the link only contains simple text content, it can be converted to a text node + if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { + var text = this._doc.createTextNode(link.textContent); + link.parentNode.replaceChild(text, link); + } else { + // if the link has multiple children, they should all be preserved + var container = this._doc.createElement("span"); + while (link.firstChild) { + container.appendChild(link.firstChild); + } + link.parentNode.replaceChild(container, link); + } + } else { + link.setAttribute("href", toAbsoluteURI(href)); + } + } + }); + + var medias = this._getAllNodesWithTag(articleContent, [ + "img", "picture", "figure", "video", "audio", "source" + ]); + + this._forEachNode(medias, function(media) { + var src = media.getAttribute("src"); + var poster = media.getAttribute("poster"); + var srcset = media.getAttribute("srcset"); + + if (src) { + media.setAttribute("src", toAbsoluteURI(src)); + } + + if (poster) { + media.setAttribute("poster", toAbsoluteURI(poster)); + } + + if (srcset) { + var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + }); + + media.setAttribute("srcset", newSrcset); + } + }); + }, + + _simplifyNestedElements: function(articleContent) { + var node = articleContent; + + while (node) { + if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { + if (this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { + var child = node.children[0]; + for (var i = 0; i < node.attributes.length; i++) { + child.setAttribute(node.attributes[i].name, node.attributes[i].value); + } + node.parentNode.replaceChild(child, node); + node = child; + continue; + } + } + + node = this._getNextNode(node); + } + }, + + /** + * Get the article title as an H1. + * + * @return string + **/ + _getArticleTitle: function() { + var doc = this._doc; + var curTitle = ""; + var origTitle = ""; + + try { + curTitle = origTitle = doc.title.trim(); + + // If they had an element with id "title" in their HTML + if (typeof curTitle !== "string") + curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]); + } catch (e) {/* ignore exceptions setting the title. */} + + var titleHadHierarchicalSeparators = false; + function wordCount(str) { + return str.split(/\s+/).length; + } + + // If there's a separator in the title, first remove the final part + if ((/ [\|\-\\\/>»] /).test(curTitle)) { + titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); + curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); + + // If the resulting title is too short (3 words or fewer), remove + // the first part instead: + if (wordCount(curTitle) < 3) + curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1"); + } else if (curTitle.indexOf(": ") !== -1) { + // Check if we have an heading containing this exact string, so we + // could assume it's the full title. + var headings = this._concatNodeLists( + doc.getElementsByTagName("h1"), + doc.getElementsByTagName("h2") + ); + var trimmedTitle = curTitle.trim(); + var match = this._someNode(headings, function(heading) { + return heading.textContent.trim() === trimmedTitle; + }); + + // If we don't, let's extract the title out of the original title string. + if (!match) { + curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); + + // If the title is now too short, try the first colon instead: + if (wordCount(curTitle) < 3) { + curTitle = origTitle.substring(origTitle.indexOf(":") + 1); + // But if we have too many words before the colon there's something weird + // with the titles and the H tags so let's just use the original title instead + } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { + curTitle = origTitle; + } + } + } else if (curTitle.length > 150 || curTitle.length < 15) { + var hOnes = doc.getElementsByTagName("h1"); + + if (hOnes.length === 1) + curTitle = this._getInnerText(hOnes[0]); + } + + curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); + // If we now have 4 words or fewer as our title, and either no + // 'hierarchical' separators (\, /, > or ») were found in the original + // title or we decreased the number of words by more than 1 word, use + // the original title. + var curTitleWordCount = wordCount(curTitle); + if (curTitleWordCount <= 4 && + (!titleHadHierarchicalSeparators || + curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { + curTitle = origTitle; + } + + return curTitle; + }, + + /** + * Prepare the HTML document for readability to scrape it. + * This includes things like stripping javascript, CSS, and handling terrible markup. + * + * @return void + **/ + _prepDocument: function() { + var doc = this._doc; + + // Remove all style tags in head + this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); + + if (doc.body) { + this._replaceBrs(doc.body); + } + + this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); + }, + + /** + * Finds the next node, starting from the given node, and ignoring + * whitespace in between. If the given node is an element, the same node is + * returned. + */ + _nextNode: function (node) { + var next = node; + while (next + && (next.nodeType != this.ELEMENT_NODE) + && this.REGEXPS.whitespace.test(next.textContent)) { + next = next.nextSibling; + } + return next; + }, + + /** + * Replaces 2 or more successive
elements with a single

. + * Whitespace between
elements are ignored. For example: + *

foo
bar


abc
+ * will become: + *
foo
bar

abc

+ */ + _replaceBrs: function (elem) { + this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) { + var next = br.nextSibling; + + // Whether 2 or more
elements have been found and replaced with a + //

block. + var replaced = false; + + // If we find a
chain, remove the
s until we hit another node + // or non-whitespace. This leaves behind the first
in the chain + // (which will be replaced with a

later). + while ((next = this._nextNode(next)) && (next.tagName == "BR")) { + replaced = true; + var brSibling = next.nextSibling; + next.parentNode.removeChild(next); + next = brSibling; + } + + // If we removed a
chain, replace the remaining
with a

. Add + // all sibling nodes as children of the

until we hit another
+ // chain. + if (replaced) { + var p = this._doc.createElement("p"); + br.parentNode.replaceChild(p, br); + + next = p.nextSibling; + while (next) { + // If we've hit another

, we're done adding children to this

. + if (next.tagName == "BR") { + var nextElem = this._nextNode(next.nextSibling); + if (nextElem && nextElem.tagName == "BR") + break; + } + + if (!this._isPhrasingContent(next)) + break; + + // Otherwise, make this node a child of the new

. + var sibling = next.nextSibling; + p.appendChild(next); + next = sibling; + } + + while (p.lastChild && this._isWhitespace(p.lastChild)) { + p.removeChild(p.lastChild); + } + + if (p.parentNode.tagName === "P") + this._setNodeTag(p.parentNode, "DIV"); + } + }); + }, + + _setNodeTag: function (node, tag) { + this.log("_setNodeTag", node, tag); + if (this._docJSDOMParser) { + node.localName = tag.toLowerCase(); + node.tagName = tag.toUpperCase(); + return node; + } + + var replacement = node.ownerDocument.createElement(tag); + while (node.firstChild) { + replacement.appendChild(node.firstChild); + } + node.parentNode.replaceChild(replacement, node); + if (node.readability) + replacement.readability = node.readability; + + for (var i = 0; i < node.attributes.length; i++) { + try { + replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); + } catch (ex) { + /* it's possible for setAttribute() to throw if the attribute name + * isn't a valid XML Name. Such attributes can however be parsed from + * source in HTML docs, see https://github.com/whatwg/html/issues/4275, + * so we can hit them here and then throw. We don't care about such + * attributes so we ignore them. + */ + } + } + return replacement; + }, + + /** + * Prepare the article node for display. Clean out any inline styles, + * iframes, forms, strip extraneous

tags, etc. + * + * @param Element + * @return void + **/ + _prepArticle: function(articleContent) { + this._cleanStyles(articleContent); + + // Check for data tables before we continue, to avoid removing items in + // those tables, which will often be isolated even though they're + // visually linked to other content-ful elements (text, images, etc.). + this._markDataTables(articleContent); + + this._fixLazyImages(articleContent); + + // Clean out junk from the article content + this._cleanConditionally(articleContent, "form"); + this._cleanConditionally(articleContent, "fieldset"); + this._clean(articleContent, "object"); + this._clean(articleContent, "embed"); + this._clean(articleContent, "footer"); + this._clean(articleContent, "link"); + this._clean(articleContent, "aside"); + + // Clean out elements with little content that have "share" in their id/class combinations from final top candidates, + // which means we don't remove the top candidates even they have "share". + + var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; + + this._forEachNode(articleContent.children, function (topCandidate) { + this._cleanMatchedNodes(topCandidate, function (node, matchString) { + return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold; + }); + }); + + this._clean(articleContent, "iframe"); + this._clean(articleContent, "input"); + this._clean(articleContent, "textarea"); + this._clean(articleContent, "select"); + this._clean(articleContent, "button"); + this._cleanHeaders(articleContent); + + // Do these last as the previous stuff may have removed junk + // that will affect these + this._cleanConditionally(articleContent, "table"); + this._cleanConditionally(articleContent, "ul"); + this._cleanConditionally(articleContent, "div"); + + // replace H1 with H2 as H1 should be only title that is displayed separately + this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2"); + + // Remove extra paragraphs + this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { + var imgCount = paragraph.getElementsByTagName("img").length; + var embedCount = paragraph.getElementsByTagName("embed").length; + var objectCount = paragraph.getElementsByTagName("object").length; + // At this point, nasty iframes have been removed, only remain embedded video ones. + var iframeCount = paragraph.getElementsByTagName("iframe").length; + var totalCount = imgCount + embedCount + objectCount + iframeCount; + + return totalCount === 0 && !this._getInnerText(paragraph, false); + }); + + this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { + var next = this._nextNode(br.nextSibling); + if (next && next.tagName == "P") + br.parentNode.removeChild(br); + }); + + // Remove single-cell tables + this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { + var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; + if (this._hasSingleTagInsideElement(tbody, "TR")) { + var row = tbody.firstElementChild; + if (this._hasSingleTagInsideElement(row, "TD")) { + var cell = row.firstElementChild; + cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); + table.parentNode.replaceChild(cell, table); + } + } + }); + }, + + /** + * Initialize a node with the readability object. Also checks the + * className/id for special names to add to its score. + * + * @param Element + * @return void + **/ + _initializeNode: function(node) { + node.readability = {"contentScore": 0}; + + switch (node.tagName) { + case "DIV": + node.readability.contentScore += 5; + break; + + case "PRE": + case "TD": + case "BLOCKQUOTE": + node.readability.contentScore += 3; + break; + + case "ADDRESS": + case "OL": + case "UL": + case "DL": + case "DD": + case "DT": + case "LI": + case "FORM": + node.readability.contentScore -= 3; + break; + + case "H1": + case "H2": + case "H3": + case "H4": + case "H5": + case "H6": + case "TH": + node.readability.contentScore -= 5; + break; + } + + node.readability.contentScore += this._getClassWeight(node); + }, + + _removeAndGetNext: function(node) { + var nextNode = this._getNextNode(node, true); + node.parentNode.removeChild(node); + return nextNode; + }, + + /** + * Traverse the DOM from node to node, starting at the node passed in. + * Pass true for the second parameter to indicate this node itself + * (and its kids) are going away, and we want the next node over. + * + * Calling this in a loop will traverse the DOM depth-first. + */ + _getNextNode: function(node, ignoreSelfAndKids) { + // First check for kids if those aren't being ignored + if (!ignoreSelfAndKids && node.firstElementChild) { + return node.firstElementChild; + } + // Then for siblings... + if (node.nextElementSibling) { + return node.nextElementSibling; + } + // And finally, move up the parent chain *and* find a sibling + // (because this is depth-first traversal, we will have already + // seen the parent nodes themselves). + do { + node = node.parentNode; + } while (node && !node.nextElementSibling); + return node && node.nextElementSibling; + }, + + // compares second text to first one + // 1 = same text, 0 = completely different text + // works the way that it splits both texts into words and then finds words that are unique in second text + // the result is given by the lower length of unique parts + _textSimilarity: function(textA, textB) { + var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + if (!tokensA.length || !tokensB.length) { + return 0; + } + var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); + var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; + return 1 - distanceB; + }, + + _checkByline: function(node, matchString) { + if (this._articleByline) { + return false; + } + + if (node.getAttribute !== undefined) { + var rel = node.getAttribute("rel"); + var itemprop = node.getAttribute("itemprop"); + } + + if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { + this._articleByline = node.textContent.trim(); + return true; + } + + return false; + }, + + _getNodeAncestors: function(node, maxDepth) { + maxDepth = maxDepth || 0; + var i = 0, ancestors = []; + while (node.parentNode) { + ancestors.push(node.parentNode); + if (maxDepth && ++i === maxDepth) + break; + node = node.parentNode; + } + return ancestors; + }, + + /*** + * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is + * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. + * + * @param page a document to run upon. Needs to be a full document, complete with body. + * @return Element + **/ + _grabArticle: function (page) { + this.log("**** grabArticle ****"); + var doc = this._doc; + var isPaging = page !== null; + page = page ? page : this._doc.body; + + // We can't grab an article if we don't have a page! + if (!page) { + this.log("No body found in document. Abort."); + return null; + } + + var pageCacheHtml = page.innerHTML; + + while (true) { + this.log("Starting grabArticle loop"); + var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); + + // First, node prepping. Trash nodes that look cruddy (like ones with the + // class name "comment", etc), and turn divs into P tags where they have been + // used inappropriately (as in, where they contain no other block level elements.) + var elementsToScore = []; + var node = this._doc.documentElement; + + let shouldRemoveTitleHeader = true; + + while (node) { + + if (node.tagName === "HTML") { + this._articleLang = node.getAttribute("lang"); + } + + var matchString = node.className + " " + node.id; + + if (!this._isProbablyVisible(node)) { + this.log("Removing hidden node - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + + // Check to see if this node is a byline, and remove it if it is. + if (this._checkByline(node, matchString)) { + node = this._removeAndGetNext(node); + continue; + } + + if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { + this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); + shouldRemoveTitleHeader = false; + node = this._removeAndGetNext(node); + continue; + } + + // Remove unlikely candidates + if (stripUnlikelyCandidates) { + if (this.REGEXPS.unlikelyCandidates.test(matchString) && + !this.REGEXPS.okMaybeItsACandidate.test(matchString) && + !this._hasAncestorTag(node, "table") && + !this._hasAncestorTag(node, "code") && + node.tagName !== "BODY" && + node.tagName !== "A") { + this.log("Removing unlikely candidate - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + + if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { + this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + } + + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). + if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || + node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || + node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && + this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } + + if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { + elementsToScore.push(node); + } + + // Turn all divs that don't have children block level elements into p's + if (node.tagName === "DIV") { + // Put phrasing content into paragraphs. + var p = null; + var childNode = node.firstChild; + while (childNode) { + var nextSibling = childNode.nextSibling; + if (this._isPhrasingContent(childNode)) { + if (p !== null) { + p.appendChild(childNode); + } else if (!this._isWhitespace(childNode)) { + p = doc.createElement("p"); + node.replaceChild(p, childNode); + p.appendChild(childNode); + } + } else if (p !== null) { + while (p.lastChild && this._isWhitespace(p.lastChild)) { + p.removeChild(p.lastChild); + } + p = null; + } + childNode = nextSibling; + } + + // Sites like http://mobile.slate.com encloses each paragraph with a DIV + // element. DIVs with only a P element inside and no text content can be + // safely converted into plain P elements to avoid confusing the scoring + // algorithm with DIVs with are, in practice, paragraphs. + if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { + var newNode = node.children[0]; + node.parentNode.replaceChild(newNode, node); + node = newNode; + elementsToScore.push(node); + } else if (!this._hasChildBlockElement(node)) { + node = this._setNodeTag(node, "P"); + elementsToScore.push(node); + } + } + node = this._getNextNode(node); + } + + /** + * Loop through all paragraphs, and assign a score to them based on how content-y they look. + * Then add their score to their parent node. + * + * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. + **/ + var candidates = []; + this._forEachNode(elementsToScore, function(elementToScore) { + if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") + return; + + // If this paragraph is less than 25 characters, don't even count it. + var innerText = this._getInnerText(elementToScore); + if (innerText.length < 25) + return; + + // Exclude nodes with no ancestor. + var ancestors = this._getNodeAncestors(elementToScore, 5); + if (ancestors.length === 0) + return; + + var contentScore = 0; + + // Add a point for the paragraph itself as a base. + contentScore += 1; + + // Add points for any commas within this paragraph. + contentScore += innerText.split(",").length; + + // For every 100 characters in this paragraph, add another point. Up to 3 points. + contentScore += Math.min(Math.floor(innerText.length / 100), 3); + + // Initialize and score ancestors. + this._forEachNode(ancestors, function(ancestor, level) { + if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") + return; + + if (typeof(ancestor.readability) === "undefined") { + this._initializeNode(ancestor); + candidates.push(ancestor); + } + + // Node score divider: + // - parent: 1 (no division) + // - grandparent: 2 + // - great grandparent+: ancestor level * 3 + if (level === 0) + var scoreDivider = 1; + else if (level === 1) + scoreDivider = 2; + else + scoreDivider = level * 3; + ancestor.readability.contentScore += contentScore / scoreDivider; + }); + }); + + // After we've calculated scores, loop through all of the possible + // candidate nodes we found and find the one with the highest score. + var topCandidates = []; + for (var c = 0, cl = candidates.length; c < cl; c += 1) { + var candidate = candidates[c]; + + // Scale the final candidates score based on link density. Good content + // should have a relatively small link density (5% or less) and be mostly + // unaffected by this operation. + var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); + candidate.readability.contentScore = candidateScore; + + this.log("Candidate:", candidate, "with score " + candidateScore); + + for (var t = 0; t < this._nbTopCandidates; t++) { + var aTopCandidate = topCandidates[t]; + + if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { + topCandidates.splice(t, 0, candidate); + if (topCandidates.length > this._nbTopCandidates) + topCandidates.pop(); + break; + } + } + } + + var topCandidate = topCandidates[0] || null; + var neededToCreateTopCandidate = false; + var parentOfTopCandidate; + + // If we still have no top candidate, just use the body as a last resort. + // We also have to copy the body node so it is something we can modify. + if (topCandidate === null || topCandidate.tagName === "BODY") { + // Move all of the page's children into topCandidate + topCandidate = doc.createElement("DIV"); + neededToCreateTopCandidate = true; + // Move everything (not just elements, also text nodes etc.) into the container + // so we even include text directly in the body: + while (page.firstChild) { + this.log("Moving child out:", page.firstChild); + topCandidate.appendChild(page.firstChild); + } + + page.appendChild(topCandidate); + + this._initializeNode(topCandidate); + } else if (topCandidate) { + // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array + // and whose scores are quite closed with current `topCandidate` node. + var alternativeCandidateAncestors = []; + for (var i = 1; i < topCandidates.length; i++) { + if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) { + alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i])); + } + } + var MINIMUM_TOPCANDIDATES = 3; + if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { + parentOfTopCandidate = topCandidate.parentNode; + while (parentOfTopCandidate.tagName !== "BODY") { + var listsContainingThisAncestor = 0; + for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) { + listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate)); + } + if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { + topCandidate = parentOfTopCandidate; + break; + } + parentOfTopCandidate = parentOfTopCandidate.parentNode; + } + } + if (!topCandidate.readability) { + this._initializeNode(topCandidate); + } + + // Because of our bonus system, parents of candidates might have scores + // themselves. They get half of the node. There won't be nodes with higher + // scores than our topCandidate, but if we see the score going *up* in the first + // few steps up the tree, that's a decent sign that there might be more content + // lurking in other places that we want to unify in. The sibling stuff + // below does some of that - but only if we've looked high enough up the DOM + // tree. + parentOfTopCandidate = topCandidate.parentNode; + var lastScore = topCandidate.readability.contentScore; + // The scores shouldn't get too low. + var scoreThreshold = lastScore / 3; + while (parentOfTopCandidate.tagName !== "BODY") { + if (!parentOfTopCandidate.readability) { + parentOfTopCandidate = parentOfTopCandidate.parentNode; + continue; + } + var parentScore = parentOfTopCandidate.readability.contentScore; + if (parentScore < scoreThreshold) + break; + if (parentScore > lastScore) { + // Alright! We found a better parent to use. + topCandidate = parentOfTopCandidate; + break; + } + lastScore = parentOfTopCandidate.readability.contentScore; + parentOfTopCandidate = parentOfTopCandidate.parentNode; + } + + // If the top candidate is the only child, use parent instead. This will help sibling + // joining logic when adjacent content is actually located in parent's sibling node. + parentOfTopCandidate = topCandidate.parentNode; + while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) { + topCandidate = parentOfTopCandidate; + parentOfTopCandidate = topCandidate.parentNode; + } + if (!topCandidate.readability) { + this._initializeNode(topCandidate); + } + } + + // Now that we have the top candidate, look through its siblings for content + // that might also be related. Things like preambles, content split by ads + // that we removed, etc. + var articleContent = doc.createElement("DIV"); + if (isPaging) + articleContent.id = "readability-content"; + + var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); + // Keep potential top candidate's parent node to try to get text direction of it later. + parentOfTopCandidate = topCandidate.parentNode; + var siblings = parentOfTopCandidate.children; + + for (var s = 0, sl = siblings.length; s < sl; s++) { + var sibling = siblings[s]; + var append = false; + + this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); + this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); + + if (sibling === topCandidate) { + append = true; + } else { + var contentBonus = 0; + + // Give a bonus if sibling nodes and top candidates have the example same classname + if (sibling.className === topCandidate.className && topCandidate.className !== "") + contentBonus += topCandidate.readability.contentScore * 0.2; + + if (sibling.readability && + ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { + append = true; + } else if (sibling.nodeName === "P") { + var linkDensity = this._getLinkDensity(sibling); + var nodeContent = this._getInnerText(sibling); + var nodeLength = nodeContent.length; + + if (nodeLength > 80 && linkDensity < 0.25) { + append = true; + } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && + nodeContent.search(/\.( |$)/) !== -1) { + append = true; + } + } + } + + if (append) { + this.log("Appending node:", sibling); + + if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { + // We have a node that isn't a common block level element, like a form or td tag. + // Turn it into a div so it doesn't get filtered out later by accident. + this.log("Altering sibling:", sibling, "to div."); + + sibling = this._setNodeTag(sibling, "DIV"); + } + + articleContent.appendChild(sibling); + // Fetch children again to make it compatible + // with DOM parsers without live collection support. + siblings = parentOfTopCandidate.children; + // siblings is a reference to the children array, and + // sibling is removed from the array when we call appendChild(). + // As a result, we must revisit this index since the nodes + // have been shifted. + s -= 1; + sl -= 1; + } + } + + if (this._debug) + this.log("Article content pre-prep: " + articleContent.innerHTML); + // So we have all of the content that we need. Now we clean it up for presentation. + this._prepArticle(articleContent); + if (this._debug) + this.log("Article content post-prep: " + articleContent.innerHTML); + + if (neededToCreateTopCandidate) { + // We already created a fake div thing, and there wouldn't have been any siblings left + // for the previous loop, so there's no point trying to create a new div, and then + // move all the children over. Just assign IDs and class names here. No need to append + // because that already happened anyway. + topCandidate.id = "readability-page-1"; + topCandidate.className = "page"; + } else { + var div = doc.createElement("DIV"); + div.id = "readability-page-1"; + div.className = "page"; + while (articleContent.firstChild) { + div.appendChild(articleContent.firstChild); + } + articleContent.appendChild(div); + } + + if (this._debug) + this.log("Article content after paging: " + articleContent.innerHTML); + + var parseSuccessful = true; + + // Now that we've gone through the full algorithm, check to see if + // we got any meaningful content. If we didn't, we may need to re-run + // grabArticle with different flags set. This gives us a higher likelihood of + // finding the content, and the sieve approach gives us a higher likelihood of + // finding the -right- content. + var textLength = this._getInnerText(articleContent, true).length; + if (textLength < this._charThreshold) { + parseSuccessful = false; + page.innerHTML = pageCacheHtml; + + if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { + this._removeFlag(this.FLAG_STRIP_UNLIKELYS); + this._attempts.push({articleContent: articleContent, textLength: textLength}); + } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { + this._removeFlag(this.FLAG_WEIGHT_CLASSES); + this._attempts.push({articleContent: articleContent, textLength: textLength}); + } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { + this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); + this._attempts.push({articleContent: articleContent, textLength: textLength}); + } else { + this._attempts.push({articleContent: articleContent, textLength: textLength}); + // No luck after removing flags, just return the longest text we found during the different loops + this._attempts.sort(function (a, b) { + return b.textLength - a.textLength; + }); + + // But first check if we actually have something + if (!this._attempts[0].textLength) { + return null; + } + + articleContent = this._attempts[0].articleContent; + parseSuccessful = true; + } + } + + if (parseSuccessful) { + // Find out text direction from ancestors of final top candidate. + var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); + this._someNode(ancestors, function(ancestor) { + if (!ancestor.tagName) + return false; + var articleDir = ancestor.getAttribute("dir"); + if (articleDir) { + this._articleDir = articleDir; + return true; + } + return false; + }); + return articleContent; + } + } + }, + + /** + * Check whether the input string could be a byline. + * This verifies that the input is a string, and that the length + * is less than 100 chars. + * + * @param possibleByline {string} - a string to check whether its a byline. + * @return Boolean - whether the input string is a byline. + */ + _isValidByline: function(byline) { + if (typeof byline == "string" || byline instanceof String) { + byline = byline.trim(); + return (byline.length > 0) && (byline.length < 100); + } + return false; + }, + + /** + * Converts some of the common HTML entities in string to their corresponding characters. + * + * @param str {string} - a string to unescape. + * @return string without HTML entity. + */ + _unescapeHtmlEntities: function(str) { + if (!str) { + return str; + } + + var htmlEscapeMap = this.HTML_ESCAPE_MAP; + return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { + return htmlEscapeMap[tag]; + }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 16 : 10); + return String.fromCharCode(num); + }); + }, + + /** + * Try to extract metadata from JSON-LD object. + * For now, only Schema.org objects of type Article or its subtypes are supported. + * @return Object with any metadata that could be extracted (possibly none) + */ + _getJSONLD: function (doc) { + var scripts = this._getAllNodesWithTag(doc, ["script"]); + + var metadata; + + this._forEachNode(scripts, function(jsonLdElement) { + if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") { + try { + // Strip CDATA markers if present + var content = jsonLdElement.textContent.replace(/^\s*\s*$/g, ""); + var parsed = JSON.parse(content); + if ( + !parsed["@context"] || + !parsed["@context"].match(/^https?\:\/\/schema\.org$/) + ) { + return; + } + + if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { + parsed = parsed["@graph"].find(function(it) { + return (it["@type"] || "").match( + this.REGEXPS.jsonLdArticleTypes + ); + }); + } + + if ( + !parsed || + !parsed["@type"] || + !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) + ) { + return; + } + + metadata = {}; + + if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) { + // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz + // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either + // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. + + var title = this._getArticleTitle(); + var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; + var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75; + + if (headlineMatches && !nameMatches) { + metadata.title = parsed.headline; + } else { + metadata.title = parsed.name; + } + } else if (typeof parsed.name === "string") { + metadata.title = parsed.name.trim(); + } else if (typeof parsed.headline === "string") { + metadata.title = parsed.headline.trim(); + } + if (parsed.author) { + if (typeof parsed.author.name === "string") { + metadata.byline = parsed.author.name.trim(); + } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") { + metadata.byline = parsed.author + .filter(function(author) { + return author && typeof author.name === "string"; + }) + .map(function(author) { + return author.name.trim(); + }) + .join(", "); + } + } + if (typeof parsed.description === "string") { + metadata.excerpt = parsed.description.trim(); + } + if ( + parsed.publisher && + typeof parsed.publisher.name === "string" + ) { + metadata.siteName = parsed.publisher.name.trim(); + } + return; + } catch (err) { + this.log(err.message); + } + } + }); + return metadata ? metadata : {}; + }, + + /** + * Attempts to get excerpt and byline metadata for the article. + * + * @param {Object} jsonld — object containing any metadata that + * could be extracted from JSON-LD object. + * + * @return Object with optional "excerpt" and "byline" properties + */ + _getArticleMetadata: function(jsonld) { + var metadata = {}; + var values = {}; + var metaElements = this._doc.getElementsByTagName("meta"); + + // property is a space-separated list of values + var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi; + + // name is a single value + var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i; + + // Find description tags. + this._forEachNode(metaElements, function(element) { + var elementName = element.getAttribute("name"); + var elementProperty = element.getAttribute("property"); + var content = element.getAttribute("content"); + if (!content) { + return; + } + var matches = null; + var name = null; + + if (elementProperty) { + matches = elementProperty.match(propertyPattern); + if (matches) { + // Convert to lowercase, and remove any whitespace + // so we can match below. + name = matches[0].toLowerCase().replace(/\s/g, ""); + // multiple authors + values[name] = content.trim(); + } + } + if (!matches && elementName && namePattern.test(elementName)) { + name = elementName; + if (content) { + // Convert to lowercase, remove any whitespace, and convert dots + // to colons so we can match below. + name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); + values[name] = content.trim(); + } + } + }); + + // get title + metadata.title = jsonld.title || + values["dc:title"] || + values["dcterm:title"] || + values["og:title"] || + values["weibo:article:title"] || + values["weibo:webpage:title"] || + values["title"] || + values["twitter:title"]; + + if (!metadata.title) { + metadata.title = this._getArticleTitle(); + } + + // get author + metadata.byline = jsonld.byline || + values["dc:creator"] || + values["dcterm:creator"] || + values["author"]; + + // get description + metadata.excerpt = jsonld.excerpt || + values["dc:description"] || + values["dcterm:description"] || + values["og:description"] || + values["weibo:article:description"] || + values["weibo:webpage:description"] || + values["description"] || + values["twitter:description"]; + + // get site name + metadata.siteName = jsonld.siteName || + values["og:site_name"]; + + // in many sites the meta value is escaped with HTML entities, + // so here we need to unescape it + metadata.title = this._unescapeHtmlEntities(metadata.title); + metadata.byline = this._unescapeHtmlEntities(metadata.byline); + metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); + metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); + + return metadata; + }, + + /** + * Check if node is image, or if node contains exactly only one image + * whether as a direct child or as its descendants. + * + * @param Element + **/ + _isSingleImage: function(node) { + if (node.tagName === "IMG") { + return true; + } + + if (node.children.length !== 1 || node.textContent.trim() !== "") { + return false; + } + + return this._isSingleImage(node.children[0]); + }, + + /** + * Find all