Merge pull request #105 from lonekorean/more-separator

Preserve more separator HTML comment
This commit is contained in:
Will Boyd
2024-02-21 12:23:38 -05:00
committed by GitHub
3 changed files with 7 additions and 3 deletions
+1 -1
View File
@@ -17,7 +17,7 @@
"luxon": "^3.4.4",
"request": "^2.88.2",
"request-promise-native": "^1.0.8",
"turndown": "^7.0.0",
"turndown": "^7.1.2",
"turndown-plugin-gfm": "^1.0.2",
"xml2js": "^0.4.23"
},
+1 -1
View File
@@ -28,7 +28,7 @@
"luxon": "^3.4.4",
"request": "^2.88.2",
"request-promise-native": "^1.0.8",
"turndown": "^7.0.0",
"turndown": "^7.1.2",
"turndown-plugin-gfm": "^1.0.2",
"xml2js": "^0.4.23"
},
+5 -1
View File
@@ -61,7 +61,7 @@ function getPostContent(post, turndownService, config) {
// insert an empty div element between double line breaks
// this nifty trick causes turndown to keep adjacent paragraphs separated
// without mucking up content inside of other elemnts (like <code> blocks)
// without mucking up content inside of other elements (like <code> blocks)
content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');
if (config.saveScrapedImages) {
@@ -75,6 +75,10 @@ function getPostContent(post, turndownService, config) {
// (using turndown's blankRule() and keep() solution did not work for me)
content = content.replace(/(<\/iframe>)/gi, '.$1');
// preserve "more" separator, max one per post, optionally with custom label
// by escaping angle brackets (will be unescaped during turndown conversion)
content = content.replace(/<(!--more( .*)?--)>/, '&lt;$1&gt;');
// use turndown to convert HTML to Markdown
content = turndownService.turndown(content);