From 0232a5957283c22785f0a7d0a038045154130f16 Mon Sep 17 00:00:00 2001 From: Will Boyd Date: Fri, 28 Feb 2025 14:04:34 -0500 Subject: [PATCH] Better image scraping regex --- src/parser.js | 2 +- src/translator.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parser.js b/src/parser.js index e81c56f..4086501 100644 --- a/src/parser.js +++ b/src/parser.js @@ -128,7 +128,7 @@ function collectScrapedImages(allPostData, postTypes) { const postId = postData.childValue('post_id'); const postContent = postData.childValue('encoded'); - const scrapedUrls = [...postContent.matchAll(/]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]); + const scrapedUrls = [...postContent.matchAll(/]+?(?<=\s)src="(.+?)"[^>]*>/gi)].map((match) => match[1]); scrapedUrls.forEach((scrapedUrl) => { let url; if (isAbsoluteUrl(scrapedUrl)) { diff --git a/src/translator.js b/src/translator.js index 6c31b81..2d4661a 100644 --- a/src/translator.js +++ b/src/translator.js @@ -117,7 +117,7 @@ export function getPostContent(content) { if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') { // writeImageFile() will save all content images to a relative /images // folder so update references in post content to match - content = content.replace(/(]*?src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3'); + content = content.replace(/(]+?(?<=\s)src=")[^"]*?([^/"]+)("[^>]*>)/gi, '$1images/$2$3'); } // preserve "more" separator, max one per post, optionally with custom label