fabio revised this gist. Go to revision
1 file changed, 25 insertions, 158 deletions
platypush-ext-save-link.js
@@ -3,177 +3,44 @@ | |||
3 | 3 | * Use together with the Reader Mode script https://gist.manganiello.tech/fabio/c731b57ff6b24d21a8f43fbedde3dc30 for best results. | |
4 | 4 | */ | |
5 | 5 | ||
6 | + | ||
6 | 7 | // Entry point for the script, which is executed when the user runs the | |
7 | 8 | // associated action. All the logic should be encapsulated in this function. | |
8 | 9 | async (app, args) => { | |
9 | - | // This is the base path where the scraped pages will be saved. | |
10 | - | // For the sake of simplicity, we will save the scraped pages to a local directory | |
11 | - | // on the same server where the Platypush service is running. | |
12 | - | // If you want to push it to another server, you can replace the call to | |
13 | - | // `file.write` at the bottom of the script with `ssh.put` | |
14 | - | // (https://docs.platypush.tech/platypush/plugins/ssh.html#platypush.plugins.ssh.SshPlugin.put) | |
15 | - | // (ensure that the `ssh` plugin is enabled on your Platypush instance). | |
16 | - | const savePath = `/CHANGEME`; | |
17 | - | ||
18 | - | // This is the URL where the scraped pages will be served from. | |
19 | - | // The simplest way to configure it is to set up a web server that serves | |
20 | - | // the files in the `savePath` directory (python -m http.server should | |
21 | - | // suffice), and then configure a reverse proxy to point to your server - | |
22 | - | // or even configure nginx itself to both serve the files and handle SSL. | |
23 | - | // It is strongly recommended to use HTTPS for this URL, as Wallabag | |
24 | - | // will probably refuse to scrape HTTP URLs. | |
25 | - | const scrapeUrl = 'https://scraped.example.com'; | |
10 | + | // (Optional) topic for the ntfy notification | |
11 | + | const ntfyTopic = 'notebook-saved-links-random-suffix'; | |
26 | 12 | ||
27 | 13 | // Get the page URL and DOM | |
28 | 14 | const url = await app.getURL(); | |
29 | 15 | const dom = await app.getDOM(); | |
30 | - | ||
31 | - | // A utility function that computes a 32-bit signed integer hash code for a given string. | |
32 | - | // This is used to derive a filename from the URL; distinct URLs can collide. | |
33 | - | const hashCode = (str) => { | |
34 | - | let hash = 0; | |
35 | - | for (let i = 0, len = str.length; i < len; i++) { | |
36 | - | let chr = str.charCodeAt(i); | |
37 | - | hash = (hash << 5) - hash + chr; | |
38 | - | hash |= 0; // Convert to 32bit integer | |
39 | - | } | |
40 | - | return hash; | |
41 | - | }; | |
42 | - | ||
43 | - | // Utility functions to get the base URL and base relative URL from a given URL string. | |
44 | - | const getBaseUrl = (urlString) => { | |
45 | - | const url = new URL(urlString); | |
46 | - | const protocol = url.protocol; | |
47 | - | const hostname = url.hostname; | |
48 | - | const port = url.port; | |
49 | - | return `${protocol}//${hostname}${port ? ':' + port : ''}`; | |
50 | - | }; | |
51 | - | ||
52 | - | // This function extracts the base relative URL (without the filename) from a given URL string. | |
53 | - | const getBaseRelativeUrl = (urlString) => { | |
54 | - | try { | |
55 | - | let url = new URL(urlString); | |
56 | - | let pathWithoutFilename = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1); | |
57 | - | return url.origin + pathWithoutFilename; | |
58 | - | } catch (error) { | |
59 | - | return urlString; | |
60 | - | } | |
61 | - | }; | |
62 | - | ||
63 | - | const baseUrl = getBaseUrl(url); | |
64 | - | const baseRelativeUrl = getBaseRelativeUrl(url); | |
65 | - | ||
66 | - | // This function replaces relative URLs in the DOM with absolute URLs based | |
67 | - | // on the original base URL. This is necessary to ensure that links and images | |
68 | - | // point to the correct location when the page is saved or shared. | |
69 | - | const replaceRelativeUrls = () => { | |
70 | - | const relativeLinks = [...dom.querySelectorAll('a')] | |
71 | - | .filter((a) => | |
72 | - | a.getAttribute('href')?.length && | |
73 | - | !a.getAttribute('href')?.match(/^(https?:\/\/)|(javascript:)/) | |
74 | - | ); | |
75 | - | ||
76 | - | const relativeImgs = [...dom.querySelectorAll('img')] | |
77 | - | .filter((a) => | |
78 | - | a.getAttribute('src')?.length && | |
79 | - | !a.getAttribute('src')?.match(/^(https?:\/\/)|(data:image\/)/) | |
80 | - | ); | |
81 | - | ||
82 | - | [...relativeLinks, ...relativeImgs].forEach((el) => { | |
83 | - | const tag = el.tagName.toLowerCase(); | |
84 | - | const attrName = tag === 'img' ? 'src' : 'href'; | |
85 | - | const attrValue = el.getAttribute(attrName); | |
86 | - | if (attrValue?.startsWith('/')) { | |
87 | - | el.setAttribute(attrName, `${baseUrl}${attrValue}`); | |
88 | - | } else { | |
89 | - | el.setAttribute(attrName, `${baseRelativeUrl}${attrValue}`); | |
90 | - | } | |
91 | - | }); | |
92 | - | }; | |
93 | - | ||
94 | - | // This function checks if the current DOM has already been | |
95 | - | // simplified/distilled by the Reader Mode script. If that's the case, then | |
96 | - | // we can directly save the simplified content on the server, and let | |
97 | - | // Wallabag scrape that URL. This ensures that any client-side restrictions | |
98 | - | // that may prevent Wallabag from scraping the original page are bypassed. | |
99 | - | const getSaveUrl = async () => { | |
100 | - | // Check if the current DOM has already been "distilled" by the Mercury script | |
16 | + | const getContent = () => { | |
17 | + | // Check if the current DOM has already been "distilled" by the Mercury script. | |
18 | + | // If that's the case, use the already distilled content as the body of the saved article. | |
101 | 19 | const simplifiedContainer = dom.querySelector('.platypush__simplified-body'); | |
20 | + | return (simplifiedContainer || dom.querySelector('body')).innerHTML; | |
21 | + | }; | |
102 | 22 | ||
103 | - | // If that's not the case, save the original URL as it is | |
104 | - | if (!simplifiedContainer) { | |
105 | - | return url; | |
106 | - | } | |
107 | - | ||
108 | - | // Otherwise, upload the simplified content to a proxy | |
109 | - | const html = document.createElement('html'); | |
110 | - | const head = document.createElement('head'); | |
111 | - | const title = document.createElement('title'); | |
112 | - | const meta = document.createElement('meta'); | |
113 | - | const body = document.createElement('body'); | |
114 | - | const originalLinkDiv = document.createElement('b'); | |
115 | - | const originalLink = document.createElement('a'); | |
116 | - | ||
117 | - | // Replace the relative URLs in the simplified content | |
118 | - | replaceRelativeUrls(); | |
119 | - | ||
120 | - | // Set up the HTML structure | |
121 | - | title.innerText = dom.querySelector('head title')?.innerText; | |
122 | - | meta.setAttribute('charset', 'utf-8'); | |
123 | - | ||
124 | - | // Put a link to the original page in the body | |
125 | - | originalLink.setAttribute('href', url); | |
126 | - | originalLink.setAttribute('target', '_blank'); | |
127 | - | originalLink.innerText = 'Original link'; | |
128 | - | originalLinkDiv.innerHTML = `${originalLink.outerHTML}<br>`; | |
129 | - | ||
130 | - | // Build the HTML document | |
131 | - | head.appendChild(title); | |
132 | - | head.appendChild(meta); | |
133 | - | body.appendChild(originalLinkDiv); | |
134 | - | body.appendChild(simplifiedContainer); | |
135 | - | html.appendChild(head); | |
136 | - | html.appendChild(body); | |
137 | - | ||
138 | - | // Generate a unique filename based on the URL hash | |
139 | - | const filename = `${hashCode(url)}.html`; | |
140 | - | const outfile = `${savePath}/${filename}`; | |
141 | - | ||
142 | - | // Upload it as HTML to the server | |
143 | - | await app.run({ | |
144 | - | action: 'file.write', | |
145 | - | args: { | |
146 | - | file: outfile, | |
147 | - | content: html.outerHTML, | |
148 | - | }, | |
149 | - | }, args.host); | |
150 | - | ||
151 | - | return `${scrapeUrl}/${filename}`; | |
152 | - | } | |
153 | - | ||
154 | - | // Get the URL to save - either the original one, or the simplified one if | |
155 | - | // the Reader Mode script has already been applied. | |
156 | - | const urlToSave = await getSaveUrl(); | |
157 | - | ||
23 | + | // Save the URL to Wallabag leveraging the Platypush API | |
24 | + | const title = dom.querySelector('head title')?.innerText; | |
158 | 25 | const response = await app.run({ | |
159 | 26 | action: 'wallabag.save', | |
160 | 27 | args: { | |
161 | - | url: urlToSave, | |
28 | + | url: url, | |
29 | + | title: title, | |
30 | + | content: getContent(), | |
162 | 31 | } | |
163 | 32 | }, args.host); | |
164 | 33 | ||
165 | - | // Send a notification to the user with the result of the save operation | |
166 | - | app.notify('Wallabag Save', response.title); | |
167 | - | ||
168 | - | // Optional, if ntfy is enabled, you can send a notification to the user | |
169 | - | // that will be received by any client running ntfy | |
170 | - | // app.run({ | |
171 | - | // action: 'ntfy.send_message', | |
172 | - | // args: { | |
173 | - | // topic: 'wallabag-save-some-random-string', | |
174 | - | // title: 'Saved on Wallabag', | |
175 | - | // message: response.title, | |
176 | - | // url: response.url, | |
177 | - | // } | |
178 | - | // }, args.host); | |
34 | + | /* | |
35 | + | // Optional: Send a notification via ntfy | |
36 | + | await app.run({ | |
37 | + | action: 'ntfy.send_message', | |
38 | + | args: { | |
39 | + | topic: ntfyTopic, | |
40 | + | message: response.title || title, | |
41 | + | title: 'URL saved to Wallabag', | |
42 | + | url: url, | |
43 | + | } | |
44 | + | }, args.host); | |
45 | + | */ | |
179 | 46 | } |
fabio revised this gist. Go to revision
1 file changed, 179 insertions
platypush-ext-save-link.js(file created)
@@ -0,0 +1,179 @@ | |||
1 | + | /** | |
2 | + | * A script for the Platypush browser extension that saves the current page URL to Wallabag. | |
3 | + | * Use together with the Reader Mode script https://gist.manganiello.tech/fabio/c731b57ff6b24d21a8f43fbedde3dc30 for best results. | |
4 | + | */ | |
5 | + | ||
6 | + | // Entry point for the script, which is executed when the user runs the | |
7 | + | // associated action. All the logic should be encapsulated in this function. | |
8 | + | async (app, args) => { | |
9 | + | // This is the base path where the scraped pages will be saved. | |
10 | + | // For the sake of simplicity, we will save the scraped pages to a local directory | |
11 | + | // on the same server where the Platypush service is running. | |
12 | + | // If you want to push it to another server, you can replace the call to | |
13 | + | // `file.write` at the bottom of the script with `ssh.put` | |
14 | + | // (https://docs.platypush.tech/platypush/plugins/ssh.html#platypush.plugins.ssh.SshPlugin.put) | |
15 | + | // (ensure that the `ssh` plugin is enabled on your Platypush instance). | |
16 | + | const savePath = `/CHANGEME`; | |
17 | + | ||
18 | + | // This is the URL where the scraped pages will be served from. | |
19 | + | // The simplest way to configure it is to set up a web server that serves | |
20 | + | // the files in the `savePath` directory (python -m http.server should | |
21 | + | // suffice), and then configure a reverse proxy to point to your server - | |
22 | + | // or even configure nginx itself to both serve the files and handle SSL. | |
23 | + | // It is strongly recommended to use HTTPS for this URL, as Wallabag | |
24 | + | // will probably refuse to scrape HTTP URLs. | |
25 | + | const scrapeUrl = 'https://scraped.example.com'; | |
26 | + | ||
27 | + | // Get the page URL and DOM | |
28 | + | const url = await app.getURL(); | |
29 | + | const dom = await app.getDOM(); | |
30 | + | ||
31 | + | // A utility function that computes a 32-bit signed integer hash code for a given string. | |
32 | + | // This is used to derive a filename from the URL; distinct URLs can collide. | |
33 | + | const hashCode = (str) => { | |
34 | + | let hash = 0; | |
35 | + | for (let i = 0, len = str.length; i < len; i++) { | |
36 | + | let chr = str.charCodeAt(i); | |
37 | + | hash = (hash << 5) - hash + chr; | |
38 | + | hash |= 0; // Convert to 32bit integer | |
39 | + | } | |
40 | + | return hash; | |
41 | + | }; | |
42 | + | ||
43 | + | // Utility functions to get the base URL and base relative URL from a given URL string. | |
44 | + | const getBaseUrl = (urlString) => { | |
45 | + | const url = new URL(urlString); | |
46 | + | const protocol = url.protocol; | |
47 | + | const hostname = url.hostname; | |
48 | + | const port = url.port; | |
49 | + | return `${protocol}//${hostname}${port ? ':' + port : ''}`; | |
50 | + | }; | |
51 | + | ||
52 | + | // This function extracts the base relative URL (without the filename) from a given URL string. | |
53 | + | const getBaseRelativeUrl = (urlString) => { | |
54 | + | try { | |
55 | + | let url = new URL(urlString); | |
56 | + | let pathWithoutFilename = url.pathname.substring(0, url.pathname.lastIndexOf('/') + 1); | |
57 | + | return url.origin + pathWithoutFilename; | |
58 | + | } catch (error) { | |
59 | + | return urlString; | |
60 | + | } | |
61 | + | }; | |
62 | + | ||
63 | + | const baseUrl = getBaseUrl(url); | |
64 | + | const baseRelativeUrl = getBaseRelativeUrl(url); | |
65 | + | ||
66 | + | // This function replaces relative URLs in the DOM with absolute URLs based | |
67 | + | // on the original base URL. This is necessary to ensure that links and images | |
68 | + | // point to the correct location when the page is saved or shared. | |
69 | + | const replaceRelativeUrls = () => { | |
70 | + | const relativeLinks = [...dom.querySelectorAll('a')] | |
71 | + | .filter((a) => | |
72 | + | a.getAttribute('href')?.length && | |
73 | + | !a.getAttribute('href')?.match(/^(https?:\/\/)|(javascript:)/) | |
74 | + | ); | |
75 | + | ||
76 | + | const relativeImgs = [...dom.querySelectorAll('img')] | |
77 | + | .filter((a) => | |
78 | + | a.getAttribute('src')?.length && | |
79 | + | !a.getAttribute('src')?.match(/^(https?:\/\/)|(data:image\/)/) | |
80 | + | ); | |
81 | + | ||
82 | + | [...relativeLinks, ...relativeImgs].forEach((el) => { | |
83 | + | const tag = el.tagName.toLowerCase(); | |
84 | + | const attrName = tag === 'img' ? 'src' : 'href'; | |
85 | + | const attrValue = el.getAttribute(attrName); | |
86 | + | if (attrValue?.startsWith('/')) { | |
87 | + | el.setAttribute(attrName, `${baseUrl}${attrValue}`); | |
88 | + | } else { | |
89 | + | el.setAttribute(attrName, `${baseRelativeUrl}${attrValue}`); | |
90 | + | } | |
91 | + | }); | |
92 | + | }; | |
93 | + | ||
94 | + | // This function checks if the current DOM has already been | |
95 | + | // simplified/distilled by the Reader Mode script. If that's the case, then | |
96 | + | // we can directly save the simplified content on the server, and let | |
97 | + | // Wallabag scrape that URL. This ensures that any client-side restrictions | |
98 | + | // that may prevent Wallabag from scraping the original page are bypassed. | |
99 | + | const getSaveUrl = async () => { | |
100 | + | // Check if the current DOM has already been "distilled" by the Mercury script | |
101 | + | const simplifiedContainer = dom.querySelector('.platypush__simplified-body'); | |
102 | + | ||
103 | + | // If that's not the case, save the original URL as it is | |
104 | + | if (!simplifiedContainer) { | |
105 | + | return url; | |
106 | + | } | |
107 | + | ||
108 | + | // Otherwise, upload the simplified content to a proxy | |
109 | + | const html = document.createElement('html'); | |
110 | + | const head = document.createElement('head'); | |
111 | + | const title = document.createElement('title'); | |
112 | + | const meta = document.createElement('meta'); | |
113 | + | const body = document.createElement('body'); | |
114 | + | const originalLinkDiv = document.createElement('b'); | |
115 | + | const originalLink = document.createElement('a'); | |
116 | + | ||
117 | + | // Replace the relative URLs in the simplified content | |
118 | + | replaceRelativeUrls(); | |
119 | + | ||
120 | + | // Set up the HTML structure | |
121 | + | title.innerText = dom.querySelector('head title')?.innerText; | |
122 | + | meta.setAttribute('charset', 'utf-8'); | |
123 | + | ||
124 | + | // Put a link to the original page in the body | |
125 | + | originalLink.setAttribute('href', url); | |
126 | + | originalLink.setAttribute('target', '_blank'); | |
127 | + | originalLink.innerText = 'Original link'; | |
128 | + | originalLinkDiv.innerHTML = `${originalLink.outerHTML}<br>`; | |
129 | + | ||
130 | + | // Build the HTML document | |
131 | + | head.appendChild(title); | |
132 | + | head.appendChild(meta); | |
133 | + | body.appendChild(originalLinkDiv); | |
134 | + | body.appendChild(simplifiedContainer); | |
135 | + | html.appendChild(head); | |
136 | + | html.appendChild(body); | |
137 | + | ||
138 | + | // Generate a unique filename based on the URL hash | |
139 | + | const filename = `${hashCode(url)}.html`; | |
140 | + | const outfile = `${savePath}/${filename}`; | |
141 | + | ||
142 | + | // Upload it as HTML to the server | |
143 | + | await app.run({ | |
144 | + | action: 'file.write', | |
145 | + | args: { | |
146 | + | file: outfile, | |
147 | + | content: html.outerHTML, | |
148 | + | }, | |
149 | + | }, args.host); | |
150 | + | ||
151 | + | return `${scrapeUrl}/${filename}`; | |
152 | + | } | |
153 | + | ||
154 | + | // Get the URL to save - either the original one, or the simplified one if | |
155 | + | // the Reader Mode script has already been applied. | |
156 | + | const urlToSave = await getSaveUrl(); | |
157 | + | ||
158 | + | const response = await app.run({ | |
159 | + | action: 'wallabag.save', | |
160 | + | args: { | |
161 | + | url: urlToSave, | |
162 | + | } | |
163 | + | }, args.host); | |
164 | + | ||
165 | + | // Send a notification to the user with the result of the save operation | |
166 | + | app.notify('Wallabag Save', response.title); | |
167 | + | ||
168 | + | // Optional, if ntfy is enabled, you can send a notification to the user | |
169 | + | // that will be received by any client running ntfy | |
170 | + | // app.run({ | |
171 | + | // action: 'ntfy.send_message', | |
172 | + | // args: { | |
173 | + | // topic: 'wallabag-save-some-random-string', | |
174 | + | // title: 'Saved on Wallabag', | |
175 | + | // message: response.title, | |
176 | + | // url: response.url, | |
177 | + | // } | |
178 | + | // }, args.host); | |
179 | + | } |