platypush-ext-save-link.js
· 6.7 KiB · JavaScript
Raw
/**
* A script for the Platypush browser extension that saves the current page URL to Wallabag.
* Use together with the Reader Mode script https://gist.manganiello.tech/fabio/c731b57ff6b24d21a8f43fbedde3dc30 for best results.
*/
// Entry point for the script, which is executed when the user runs the
// associated action. All the logic should be encapsulated in this function.
async (app, args) => {
// Directory in which the simplified pages are stored. For the sake of
// simplicity this is a local directory on the same server that runs the
// Platypush service. To push the pages to another server instead, replace
// the `file.write` call further down in this script with `ssh.put`
// (https://docs.platypush.tech/platypush/plugins/ssh.html#platypush.plugins.ssh.SshPlugin.put)
// and make sure that the `ssh` plugin is enabled on your Platypush instance.
const savePath = '/CHANGEME';
// Base URL under which the files stored in `savePath` are served.
// The simplest setup is a web server that exposes the `savePath` directory
// (`python -m http.server` should suffice) behind a reverse proxy - or nginx
// configured to both serve the files and handle SSL.
// HTTPS is strongly recommended, as Wallabag will probably refuse to scrape
// plain HTTP URLs.
const scrapeUrl = 'https://scraped.example.com';
// URL and DOM of the current page, as provided by the extension API
const url = await app.getURL();
const dom = await app.getDOM();
// Hashes a string into a signed 32-bit integer (multiplier 31, applied to
// each UTF-16 code unit in order). The result is used to derive a file name
// from the page URL; note that it can be negative.
const hashCode = (str) => {
  let acc = 0;
  let pos = 0;
  while (pos < str.length) {
    const unit = str.charCodeAt(pos);
    // (acc << 5) - acc === acc * 31; `| 0` wraps the sum back to 32 bits
    acc = ((acc << 5) - acc + unit) | 0;
    pos += 1;
  }
  return acc;
};
// Returns the `protocol//hostname[:port]` prefix of an absolute URL string.
// The port is only included when the parsed URL carries an explicit one.
// Throws (from the `URL` constructor) if `urlString` cannot be parsed.
const getBaseUrl = (urlString) => {
  const { protocol, hostname, port } = new URL(urlString);
  const portSuffix = port ? `:${port}` : '';
  return `${protocol}//${hostname}${portSuffix}`;
};
// Returns the "directory" of a URL: its origin followed by the path up to and
// including the last '/', i.e. without the file name, query string or
// fragment. If the input cannot be parsed as a URL it is returned unchanged.
const getBaseRelativeUrl = (urlString) => {
  try {
    const { origin, pathname } = new URL(urlString);
    const dirEnd = pathname.lastIndexOf('/') + 1;
    return origin + pathname.slice(0, dirEnd);
  } catch {
    return urlString;
  }
};
// Prefixes derived once from the current page URL:
// - baseUrl: protocol, hostname and optional port (no trailing slash).
//   getBaseUrl throws if `url` cannot be parsed.
// - baseRelativeUrl: origin plus the path up to and including the last '/'.
//   getBaseRelativeUrl returns `url` unchanged if it cannot be parsed.
const baseUrl = getBaseUrl(url);
const baseRelativeUrl = getBaseRelativeUrl(url);
// Rewrites, in place, every relative `href` on <a> elements and every
// relative `src` on <img> elements of `dom` into an absolute URL resolved
// against the URL of the current page (`url`). This keeps links and images
// working once the content is served from a different location.
//
// Values that already carry a scheme (https:, mailto:, tel:, data:,
// javascript:, ...) are left untouched. Everything else - root-relative
// paths, document-relative paths, `../` segments, protocol-relative
// `//host/path`, bare `?query` and `#fragment` references - is resolved by
// the standard URL parser rather than by string concatenation.
const replaceRelativeUrls = () => {
  // A leading "<scheme>:" means the value is already absolute
  const hasScheme = /^[a-z][a-z0-9+.-]*:/i;

  const absolutize = (el, attrName) => {
    const value = el.getAttribute(attrName);
    // Skip missing/empty attributes and values that are already absolute
    if (!value?.length || hasScheme.test(value)) {
      return;
    }

    try {
      el.setAttribute(attrName, new URL(value, url).href);
    } catch (error) {
      // Best effort: a value the URL parser rejects is kept as it is
    }
  };

  for (const link of dom.querySelectorAll('a')) {
    absolutize(link, 'href');
  }

  for (const img of dom.querySelectorAll('img')) {
    absolutize(img, 'src');
  }
};
// Decides which URL is handed over to Wallabag.
//
// If the current DOM has not been simplified by the Reader Mode script (no
// `.platypush__simplified-body` element), the original page URL is returned.
// Otherwise the simplified content is wrapped in a minimal HTML document,
// written to `savePath` through the `file.write` action, and the URL under
// `scrapeUrl` where that file is served is returned. Letting Wallabag scrape
// this copy bypasses client-side restrictions that may prevent it from
// scraping the original page.
//
// Side effects on the simplified path: relative URLs in `dom` are made
// absolute, and the simplified container is moved out of `dom` into the
// generated document.
const getSaveUrl = async () => {
  // Present only if the page has already been distilled by the Reader Mode script
  const simplifiedContainer = dom.querySelector('.platypush__simplified-body');

  // Nothing was distilled: save the original URL as it is
  if (!simplifiedContainer) {
    return url;
  }

  // Make the links and images of the simplified content absolute before the
  // content is served from a different location
  replaceRelativeUrls();

  const html = document.createElement('html');
  const head = document.createElement('head');
  const title = document.createElement('title');
  const meta = document.createElement('meta');
  const body = document.createElement('body');
  const originalLinkWrapper = document.createElement('b');
  const originalLink = document.createElement('a');

  // Fall back to an empty title: assigning `undefined` would be stringified
  // into the literal title "undefined" when the page has no <title>
  title.innerText = dom.querySelector('head title')?.innerText ?? '';
  meta.setAttribute('charset', 'utf-8');

  // Bold "Original link" pointing back to the source page, followed by a line break
  originalLink.setAttribute('href', url);
  originalLink.setAttribute('target', '_blank');
  originalLink.innerText = 'Original link';
  originalLinkWrapper.appendChild(originalLink);
  originalLinkWrapper.appendChild(document.createElement('br'));

  // Assemble the document
  head.appendChild(title);
  head.appendChild(meta);
  body.appendChild(originalLinkWrapper);
  body.appendChild(simplifiedContainer);
  html.appendChild(head);
  html.appendChild(body);

  // The file name is derived from a hash of the page URL, so saving the same
  // page again overwrites the previous copy
  const filename = `${hashCode(url)}.html`;
  const outfile = `${savePath}/${filename}`;

  // Store the document on the server
  await app.run({
    action: 'file.write',
    args: {
      file: outfile,
      content: html.outerHTML,
    },
  }, args.host);

  return `${scrapeUrl}/${filename}`;
};
// Resolve the URL that Wallabag should fetch: the original page URL, or the
// URL of the uploaded simplified copy if the Reader Mode script has already
// been applied to the page.
const urlToSave = await getSaveUrl();
const saveRequest = {
  action: 'wallabag.save',
  args: { url: urlToSave },
};
const response = await app.run(saveRequest, args.host);
// Notify the user, showing the title reported back by the save action
app.notify('Wallabag Save', response.title);
// Optional, if ntfy is enabled, you can send a notification to the user
// that will be received by any client running ntfy
// app.run({
// action: 'ntfy.send_message',
// args: {
// topic: 'wallabag-save-some-random-string',
// title: 'Saved on Wallabag',
// message: response.title,
// url: response.url,
// }
// }, args.host);
}
| 1 | /** |
| 2 | * A script for the Platypush browser extension that saves the current page URL to Wallabag. |
| 3 | * Use together with the Reader Mode script https://gist.manganiello.tech/fabio/c731b57ff6b24d21a8f43fbedde3dc30 for best results. |
| 4 | */ |
| 5 | |
| 6 | // Entry point for the script, which is executed when the user runs the |
| 7 | // associated action. All the logic should be encapsulated in this function. |
| 8 | async (app, args) => { |
| 9 | // This is the base path where the scraped pages will be saved. |
| 10 | // For sake of simplicity, we will save the scraped pages to a local directory |
| 11 | // on the same server where the Platypush service is running. |
| 12 | // If you want to push it to another server, you can replace the call to |
| 13 | // `file.write` at the bottom of the script with `ssh.put` |
| 14 | // (https://docs.platypush.tech/platypush/plugins/ssh.html#platypush.plugins.ssh.SshPlugin.put) |
| 15 | // (ensure that the `ssh` plugin is enabled on your Platypush instance). |
| 16 | const savePath = `/CHANGEME`; |
| 17 | |
| 18 | // This is the URL where the scraped pages will be served from. |
| 19 | // The simplest way to configure it is to set up a web server that serves |
| 20 | // the files in the `savePath` directory (python -m http.server should |
| 21 | // suffice), and then configure a reverse proxy to point to your server - |
| 22 | // or even configure nginx itself to both serve the files and handle SSL. |
| 23 | // It is strongly recommended to use HTTPS for this URL, as Wallabag |
| 24 | // will probably refuse to scrape HTTP URLs. |
| 25 | const scrapeUrl = 'https://scraped.example.com'; |
| 26 | |
| 27 | // Get the page URL and DOM |
| 28 | const url = await app.getURL(); |
| 29 | const dom = await app.getDOM(); |
| 30 | |
| 31 | // A utility function that generates a unique hash code for a given string. |
| 32 | // This is used to create a unique filename based on the URL. |
| 33 | const hashCode = (str) => { |
| 34 | let hash = 0; |
| 35 | for (let i = 0, len = str.length; i < len; i++) { |
| 36 | let chr = str.charCodeAt(i); |
| 37 | hash = (hash << 5) - hash + chr; |
| 38 | hash |= 0; // Convert to 32bit integer |
| 39 | } |
| 40 | return hash; |
| 41 | }; |
| 42 | |
| 43 | // Utility functions to get the base URL and base relative URL from a given URL string. |
| 44 | const getBaseUrl = (urlString) => { |
| 45 | const url = new URL(urlString); |
| 46 | const protocol = url.protocol; |
| 47 | const hostname = url.hostname; |
| 48 | const port = url.port; |
| 49 | return `${protocol}//${hostname}${port ? ':' + port : ''}`; |
| 50 | }; |
| 51 | |
| 52 | // This function extracts the base relative URL (without the filename) from a given URL string. |
| 53 | const getBaseRelativeUrl = (urlString) => { |
| 54 | try { |
| 55 | let url = new URL(urlString); |
| 56 | let pathWithoutFilename = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1); |
| 57 | return url.origin + pathWithoutFilename; |
| 58 | } catch (error) { |
| 59 | return urlString; |
| 60 | } |
| 61 | }; |
| 62 | |
| 63 | const baseUrl = getBaseUrl(url); |
| 64 | const baseRelativeUrl = getBaseRelativeUrl(url); |
| 65 | |
| 66 | // This function replaces relative URLs in the DOM with absolute URLs based |
| 67 | // on the original base URL. This is necessary to ensure that links and images |
| 68 | // point to the correct location when the page is saved or shared. |
| 69 | const replaceRelativeUrls = () => { |
| 70 | const relativeLinks = [...dom.querySelectorAll('a')] |
| 71 | .filter((a) => |
| 72 | a.getAttribute('href')?.length && |
| 73 | !a.getAttribute('href')?.match(/^(https?:\/\/)|(javascript:)/) |
| 74 | ); |
| 75 | |
| 76 | const relativeImgs = [...dom.querySelectorAll('img')] |
| 77 | .filter((a) => |
| 78 | a.getAttribute('src')?.length && |
| 79 | !a.getAttribute('src')?.match(/^(https?:\/\/)|(data:image\/)/) |
| 80 | ); |
| 81 | |
| 82 | [...relativeLinks, ...relativeImgs].forEach((el) => { |
| 83 | const tag = el.tagName.toLowerCase(); |
| 84 | const attrName = tag === 'img' ? 'src' : 'href'; |
| 85 | const attrValue = el.getAttribute(attrName); |
| 86 | if (attrValue?.startsWith('/')) { |
| 87 | el.setAttribute(attrName, `${baseUrl}${attrValue}`); |
| 88 | } else { |
| 89 | el.setAttribute(attrName, `${baseRelativeUrl}${attrValue}`); |
| 90 | } |
| 91 | }); |
| 92 | }; |
| 93 | |
| 94 | // This function checks if the current DOM has already been |
| 95 | // simplified/distilled by the Reader Mode script. If that's the case, then |
| 96 | // we can directly save the simplified content on the server, and let |
| 97 | // Wallabag scrape that URL. This ensures that any client-side restrictions |
| 98 | // that may prevent Wallabag from scraping the original page are bypassed. |
| 99 | const getSaveUrl = async () => { |
| 100 | // Check if the current DOM has already been "distilled" by the Mercury script |
| 101 | const simplifiedContainer = dom.querySelector('.platypush__simplified-body'); |
| 102 | |
| 103 | // If that's not the case, save the original URL as it is |
| 104 | if (!simplifiedContainer) { |
| 105 | return url; |
| 106 | } |
| 107 | |
| 108 | // Otherwise, upload the simplified content to a proxy |
| 109 | const html = document.createElement('html'); |
| 110 | const head = document.createElement('head'); |
| 111 | const title = document.createElement('title'); |
| 112 | const meta = document.createElement('meta'); |
| 113 | const body = document.createElement('body'); |
| 114 | const originalLinkDiv = document.createElement('b'); |
| 115 | const originalLink = document.createElement('a'); |
| 116 | |
| 117 | // Replace the relative URLs in the simplified content |
| 118 | replaceRelativeUrls(); |
| 119 | |
| 120 | // Set up the HTML structure |
| 121 | title.innerText = dom.querySelector('head title')?.innerText; |
| 122 | meta.setAttribute('charset', 'utf-8'); |
| 123 | |
| 124 | // Put a link to the original page in the body |
| 125 | originalLink.setAttribute('href', url); |
| 126 | originalLink.setAttribute('target', '_blank'); |
| 127 | originalLink.innerText = 'Original link'; |
| 128 | originalLinkDiv.innerHTML = `${originalLink.outerHTML}<br>`; |
| 129 | |
| 130 | // Build the HTML document |
| 131 | head.appendChild(title); |
| 132 | head.appendChild(meta); |
| 133 | body.appendChild(originalLinkDiv); |
| 134 | body.appendChild(simplifiedContainer); |
| 135 | html.appendChild(head); |
| 136 | html.appendChild(body); |
| 137 | |
| 138 | // Generate a unique filename based on the URL hash |
| 139 | const filename = `${hashCode(url)}.html`; |
| 140 | const outfile = `${savePath}/${filename}`; |
| 141 | |
| 142 | // Upload it as HTML to the server |
| 143 | await app.run({ |
| 144 | action: 'file.write', |
| 145 | args: { |
| 146 | file: outfile, |
| 147 | content: html.outerHTML, |
| 148 | }, |
| 149 | }, args.host); |
| 150 | |
| 151 | return `${scrapeUrl}/${filename}`; |
| 152 | } |
| 153 | |
| 154 | // Get the URL to save - either the original one, or the simplified one if |
| 155 | // the Reader Mode script has already been applied. |
| 156 | const urlToSave = await getSaveUrl(); |
| 157 | |
| 158 | const response = await app.run({ |
| 159 | action: 'wallabag.save', |
| 160 | args: { |
| 161 | url: urlToSave, |
| 162 | } |
| 163 | }, args.host); |
| 164 | |
| 165 | // Send a notification to the user with the result of the save operation |
| 166 | app.notify('Wallabag Save', response.title); |
| 167 | |
| 168 | // Optional, if ntfy is enabled, you can send a notification to the user |
| 169 | // that will be received by any client running ntfy |
| 170 | // app.run({ |
| 171 | // action: 'ntfy.send_message', |
| 172 | // args: { |
| 173 | // topic: 'wallabag-save-some-random-string', |
| 174 | // title: 'Saved on Wallabag', |
| 175 | // message: response.title, |
| 176 | // url: response.url, |
| 177 | // } |
| 178 | // }, args.host); |
| 179 | } |