platypush-ext-save-link.js
/**
* A script for the Platypush browser extension that saves the current page URL to Wallabag.
* Use together with the Reader Mode script https://gist.manganiello.tech/fabio/c731b57ff6b24d21a8f43fbedde3dc30 for best results.
*/
// Entry point for the script, which is executed when the user runs the
// associated action. All the logic should be encapsulated in this function.
async (app, args) => {
// This is the base path where the scraped pages will be saved.
  // For the sake of simplicity, we save the scraped pages to a local directory
  // on the same server where the Platypush service is running.
  // If you want to push them to another server, you can replace the `file.write`
  // call further down in the script with `ssh.put`
  // (https://docs.platypush.tech/platypush/plugins/ssh.html#platypush.plugins.ssh.SshPlugin.put);
  // a commented-out sketch is included next to that call. Make sure that the
  // `ssh` plugin is enabled on your Platypush instance.
  const savePath = '/CHANGEME';
  // This is the URL where the scraped pages will be served from.
  // The simplest way to set it up is to run a web server that serves the files
  // in the `savePath` directory (`python -m http.server` should suffice) and
  // put a reverse proxy in front of it - or let nginx itself both serve the
  // files and handle SSL.
  // It is strongly recommended to use HTTPS for this URL, as Wallabag
  // will probably refuse to scrape HTTP URLs.
const scrapeUrl = 'https://scraped.example.com';
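  // A minimal nginx sketch for the setup described above - illustrative only,
  // with hypothetical certificate paths; adapt it to your own environment:
  //
  //   server {
  //     listen 443 ssl;
  //     server_name scraped.example.com;
  //     root /CHANGEME;   # same directory as `savePath`
  //     ssl_certificate     /etc/letsencrypt/live/scraped.example.com/fullchain.pem;
  //     ssl_certificate_key /etc/letsencrypt/live/scraped.example.com/privkey.pem;
  //   }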
// Get the page URL and DOM
const url = await app.getURL();
const dom = await app.getDOM();
  // A utility function that computes a 32-bit hash of a given string.
  // It is used to derive a (practically unique) filename from the page URL.
const hashCode = (str) => {
let hash = 0;
for (let i = 0, len = str.length; i < len; i++) {
let chr = str.charCodeAt(i);
hash = (hash << 5) - hash + chr;
hash |= 0; // Convert to 32bit integer
}
return hash;
};
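  // Example: hashCode('https://example.com/post') returns the same signed 32-bit
  // integer on every run, so the derived filename is stable for a given URL
  // (the value may be negative, which is harmless in a filename).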
// Utility functions to get the base URL and base relative URL from a given URL string.
const getBaseUrl = (urlString) => {
const url = new URL(urlString);
const protocol = url.protocol;
const hostname = url.hostname;
const port = url.port;
return `${protocol}//${hostname}${port ? ':' + port : ''}`;
};
// This function extracts the base relative URL (without the filename) from a given URL string.
const getBaseRelativeUrl = (urlString) => {
try {
let url = new URL(urlString);
let pathWithoutFilename = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1);
return url.origin + pathWithoutFilename;
} catch (error) {
return urlString;
}
};
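  // Examples (illustrative URL):
  //   getBaseUrl('https://example.com:8080/posts/article.html')         -> 'https://example.com:8080'
  //   getBaseRelativeUrl('https://example.com:8080/posts/article.html') -> 'https://example.com:8080/posts/'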
const baseUrl = getBaseUrl(url);
const baseRelativeUrl = getBaseRelativeUrl(url);
// This function replaces relative URLs in the DOM with absolute URLs based
// on the original base URL. This is necessary to ensure that links and images
// point to the correct location when the page is saved or shared.
const replaceRelativeUrls = () => {
const relativeLinks = [...dom.querySelectorAll('a')]
.filter((a) =>
a.getAttribute('href')?.length &&
!a.getAttribute('href')?.match(/^(https?:\/\/)|(javascript:)/)
);
    const relativeImgs = [...dom.querySelectorAll('img')]
      .filter((img) =>
        img.getAttribute('src')?.length &&
        !img.getAttribute('src')?.match(/^(https?:\/\/)|(data:image\/)/)
      );
[...relativeLinks, ...relativeImgs].forEach((el) => {
const tag = el.tagName.toLowerCase();
const attrName = tag === 'img' ? 'src' : 'href';
const attrValue = el.getAttribute(attrName);
if (attrValue?.startsWith('/')) {
el.setAttribute(attrName, `${baseUrl}${attrValue}`);
} else {
el.setAttribute(attrName, `${baseRelativeUrl}${attrValue}`);
}
});
};
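  // For example, if the page lives at 'https://example.com/posts/article.html':
  //   <a href="/about">     becomes <a href="https://example.com/about">
  //   <img src="thumb.png"> becomes <img src="https://example.com/posts/thumb.png">
  // Absolute http(s) links, javascript: links and data: images are left untouched.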
// This function checks if the current DOM has already been
// simplified/distilled by the Reader Mode script. If that's the case, then
// we can directly save the simplified content on the server, and let
// Wallabag scrape that URL. This ensures that any client-side restrictions
// that may prevent Wallabag from scraping the original page are bypassed.
const getSaveUrl = async () => {
    // Check if the current DOM has already been "distilled" by the Reader Mode (Mercury) script
const simplifiedContainer = dom.querySelector('.platypush__simplified-body');
// If that's not the case, save the original URL as it is
if (!simplifiedContainer) {
return url;
}
    // Otherwise, build a standalone HTML document from the simplified content
    // and upload it to the server configured at the top of the script
const html = document.createElement('html');
const head = document.createElement('head');
const title = document.createElement('title');
const meta = document.createElement('meta');
const body = document.createElement('body');
const originalLinkDiv = document.createElement('b');
const originalLink = document.createElement('a');
// Replace the relative URLs in the simplified content
replaceRelativeUrls();
// Set up the HTML structure
    // Fall back to the page URL if the document has no <title>
    title.innerText = dom.querySelector('head title')?.innerText || url;
meta.setAttribute('charset', 'utf-8');
// Put a link to the original page in the body
originalLink.setAttribute('href', url);
originalLink.setAttribute('target', '_blank');
originalLink.innerText = 'Original link';
originalLinkDiv.innerHTML = `${originalLink.outerHTML}<br>`;
// Build the HTML document
head.appendChild(title);
head.appendChild(meta);
body.appendChild(originalLinkDiv);
body.appendChild(simplifiedContainer);
html.appendChild(head);
html.appendChild(body);
// Generate a unique filename based on the URL hash
const filename = `${hashCode(url)}.html`;
const outfile = `${savePath}/${filename}`;
// Upload it as HTML to the server
await app.run({
action: 'file.write',
args: {
file: outfile,
content: html.outerHTML,
},
}, args.host);
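    // A hedged sketch of the `ssh.put` alternative mentioned at the top of the
    // script, for when the directory behind `scrapeUrl` lives on a different
    // host than Platypush. `ssh.put` copies a local file to a remote path, so
    // you would first write the document to a temporary local file and then
    // push it. The argument names below are assumptions - double-check them
    // against the SshPlugin.put docs linked above before enabling this.
    //
    // await app.run({
    //   action: 'file.write',
    //   args: { file: `/tmp/${filename}`, content: html.outerHTML },
    // }, args.host);
    // await app.run({
    //   action: 'ssh.put',
    //   args: {
    //     local_path: `/tmp/${filename}`,   // temporary copy written above
    //     remote_path: outfile,             // `${savePath}/${filename}` on the remote host
    //     // plus whatever connection arguments (remote host, user, key) your
    //     // ssh plugin configuration requires
    //   },
    // }, args.host);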
return `${scrapeUrl}/${filename}`;
  };
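  // For example (illustrative hash value): a distilled page whose URL hashes to
  // 1234567890 is written to `${savePath}/1234567890.html` and handed to
  // Wallabag as `${scrapeUrl}/1234567890.html`.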
// Get the URL to save - either the original one, or the simplified one if
// the Reader Mode script has already been applied.
const urlToSave = await getSaveUrl();
const response = await app.run({
action: 'wallabag.save',
args: {
url: urlToSave,
}
}, args.host);
// Send a notification to the user with the result of the save operation
app.notify('Wallabag Save', response.title);
  // Optionally, if the ntfy plugin is enabled, you can also send a notification
  // that will be delivered to any client subscribed to the topic via ntfy:
// app.run({
// action: 'ntfy.send_message',
// args: {
// topic: 'wallabag-save-some-random-string',
// title: 'Saved on Wallabag',
// message: response.title,
// url: response.url,
// }
// }, args.host);
}