platypush-ext-save-link.js
/**
* A script for the Platypush browser extension that saves the current page URL to Wallabag.
* Use together with the Reader Mode script https://gist.manganiello.tech/fabio/c731b57ff6b24d21a8f43fbedde3dc30 for best results.
*/
// Entry point for the script, which is executed when the user runs the
// associated action. All the logic should be encapsulated in this function.
async (app, args) => {
// This is the base path where the scraped pages will be saved.
  // For the sake of simplicity, we save the scraped pages to a local directory
  // on the same server where the Platypush service is running.
  // If you want to push them to another server, you can replace the `file.write`
  // call further down in the script with `ssh.put`
  // (https://docs.platypush.tech/platypush/plugins/ssh.html#platypush.plugins.ssh.SshPlugin.put);
  // a commented-out sketch is included next to that call. Make sure that the
  // `ssh` plugin is enabled on your Platypush instance.
  const savePath = '/CHANGEME';
  // This is the URL where the scraped pages will be served from.
  // The simplest way to set it up is to run a web server that serves the files
  // in the `savePath` directory (`python -m http.server` should suffice) and
  // put a reverse proxy in front of it - or let nginx itself both serve the
  // files and handle SSL.
  // It is strongly recommended to use HTTPS for this URL, as Wallabag
  // will probably refuse to scrape HTTP URLs.
const scrapeUrl = 'https://scraped.example.com';
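  // A minimal nginx sketch for the setup described above - illustrative only,
  // with hypothetical certificate paths; adapt it to your own environment:
  //
  //   server {
  //     listen 443 ssl;
  //     server_name scraped.example.com;
  //     root /CHANGEME;   # same directory as `savePath`
  //     ssl_certificate     /etc/letsencrypt/live/scraped.example.com/fullchain.pem;
  //     ssl_certificate_key /etc/letsencrypt/live/scraped.example.com/privkey.pem;
  //   }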
// Get the page URL and DOM
const url = await app.getURL();
const dom = await app.getDOM();
  // A utility function that computes a 32-bit hash of a given string.
  // It is used to derive a (practically unique) filename from the page URL.
const hashCode = (str) => {
let hash = 0;
for (let i = 0, len = str.length; i < len; i++) {
let chr = str.charCodeAt(i);
hash = (hash << 5) - hash + chr;
hash |= 0; // Convert to 32bit integer
}
return hash;
};
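  // Example: hashCode('https://example.com/post') returns the same signed 32-bit
  // integer on every run, so the derived filename is stable for a given URL
  // (the value may be negative, which is harmless in a filename).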
// Utility functions to get the base URL and base relative URL from a given URL string.
const getBaseUrl = (urlString) => {
const url = new URL(urlString);
const protocol = url.protocol;
const hostname = url.hostname;
const port = url.port;
return `${protocol}//${hostname}${port ? ':' + port : ''}`;
};
// This function extracts the base relative URL (without the filename) from a given URL string.
const getBaseRelativeUrl = (urlString) => {
try {
let url = new URL(urlString);
let pathWithoutFilename = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
return url.origin + pathWithoutFilename;
} catch (error) {
return urlString;
}
};
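  // Examples (illustrative URL):
  //   getBaseUrl('https://example.com:8080/posts/article.html')         -> 'https://example.com:8080'
  //   getBaseRelativeUrl('https://example.com:8080/posts/article.html') -> 'https://example.com:8080/posts/'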
const baseUrl = getBaseUrl(url);
const baseRelativeUrl = getBaseRelativeUrl(url);
// This function replaces relative URLs in the DOM with absolute URLs based
// on the original base URL. This is necessary to ensure that links and images
// point to the correct location when the page is saved or shared.
const replaceRelativeUrls = () => {
const relativeLinks = [...dom.querySelectorAll('a')]
.filter((a) =>
a.getAttribute('href')?.length &&
!a.getAttribute('href')?.match(/^(https?:\/\/)|(javascript:)/)
);
    const relativeImgs = [...dom.querySelectorAll('img')]
      .filter((img) =>
        img.getAttribute('src')?.length &&
        !img.getAttribute('src')?.match(/^(https?:\/\/)|(data:image\/)/)
      );
[...relativeLinks, ...relativeImgs].forEach((el) => {
const tag = el.tagName.toLowerCase();
const attrName = tag === 'img' ? 'src' : 'href';
const attrValue = el.getAttribute(attrName);
if (attrValue?.startsWith('/')) {
el.setAttribute(attrName, `${baseUrl}${attrValue}`);
} else {
el.setAttribute(attrName, `${baseRelativeUrl}${attrValue}`);
}
});
};
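  // For example, if the page lives at 'https://example.com/posts/article.html':
  //   <a href="/about">     becomes <a href="https://example.com/about">
  //   <img src="thumb.png"> becomes <img src="https://example.com/posts/thumb.png">
  // Absolute http(s) links, javascript: links and data: images are left untouched.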
// This function checks if the current DOM has already been
// simplified/distilled by the Reader Mode script. If that's the case, then
// we can directly save the simplified content on the server, and let
// Wallabag scrape that URL. This ensures that any client-side restrictions
// that may prevent Wallabag from scraping the original page are bypassed.
const getSaveUrl = async () => {
    // Check if the current DOM has already been "distilled" by the Reader Mode (Mercury) script
const simplifiedContainer = dom.querySelector('.platypush__simplified-body');
// If that's not the case, save the original URL as it is
if (!simplifiedContainer) {
return url;
}
    // Otherwise, build a standalone HTML document from the simplified content
    // and upload it to the server configured at the top of the script
const html = document.createElement('html');
const head = document.createElement('head');
const title = document.createElement('title');
const meta = document.createElement('meta');
const body = document.createElement('body');
const originalLinkDiv = document.createElement('b');
const originalLink = document.createElement('a');
// Replace the relative URLs in the simplified content
replaceRelativeUrls();
// Set up the HTML structure
    // Fall back to the page URL if the document has no <title>
    title.innerText = dom.querySelector('head title')?.innerText || url;
meta.setAttribute('charset', 'utf-8');
// Put a link to the original page in the body
originalLink.setAttribute('href', url);
originalLink.setAttribute('target', '_blank');
originalLink.innerText = 'Original link';
originalLinkDiv.innerHTML = `${originalLink.outerHTML}<br>`;
// Build the HTML document
head.appendChild(title);
head.appendChild(meta);
body.appendChild(originalLinkDiv);
body.appendChild(simplifiedContainer);
html.appendChild(head);
html.appendChild(body);
// Generate a unique filename based on the URL hash
const filename = `${hashCode(url)}.html`;
const outfile = `${savePath}/${filename}`;
// Upload it as HTML to the server
await app.run({
action: 'file.write',
args: {
file: outfile,
content: html.outerHTML,
},
}, args.host);
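    // A hedged sketch of the `ssh.put` alternative mentioned at the top of the
    // script, for when the directory behind `scrapeUrl` lives on a different
    // host than Platypush. `ssh.put` copies a local file to a remote path, so
    // you would first write the document to a temporary local file and then
    // push it. The argument names below are assumptions - double-check them
    // against the SshPlugin.put docs linked above before enabling this.
    //
    // await app.run({
    //   action: 'file.write',
    //   args: { file: `/tmp/${filename}`, content: html.outerHTML },
    // }, args.host);
    // await app.run({
    //   action: 'ssh.put',
    //   args: {
    //     local_path: `/tmp/${filename}`,   // temporary copy written above
    //     remote_path: outfile,             // `${savePath}/${filename}` on the remote host
    //     // plus whatever connection arguments (remote host, user, key) your
    //     // ssh plugin configuration requires
    //   },
    // }, args.host);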
return `${scrapeUrl}/${filename}`;
  };
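  // For example (illustrative hash value): a distilled page whose URL hashes to
  // 1234567890 is written to `${savePath}/1234567890.html` and handed to
  // Wallabag as `${scrapeUrl}/1234567890.html`.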
// Get the URL to save - either the original one, or the simplified one if
// the Reader Mode script has already been applied.
const urlToSave = await getSaveUrl();
const response = await app.run({
action: 'wallabag.save',
args: {
url: urlToSave,
}
}, args.host);
// Send a notification to the user with the result of the save operation
app.notify('Wallabag Save', response.title);
  // Optionally, if the ntfy plugin is enabled, you can also send a notification
  // that will be delivered to any client subscribed to the topic via ntfy:
// app.run({
// action: 'ntfy.send_message',
// args: {
// topic: 'wallabag-save-some-random-string',
// title: 'Saved on Wallabag',
// message: response.title,
// url: response.url,
// }
// }, args.host);
}