Better image scraping regex

This commit is contained in:
Will Boyd
2025-02-28 14:04:34 -05:00
parent fbf6fe42da
commit 0232a59572
2 changed files with 2 additions and 2 deletions
+1 -1
View File
@@ -128,7 +128,7 @@ function collectScrapedImages(allPostData, postTypes) {
const postId = postData.childValue('post_id');
const postContent = postData.childValue('encoded');
const scrapedUrls = [...postContent.matchAll(/<img\s[^>]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]);
const scrapedUrls = [...postContent.matchAll(/<img(?=\s)[^>]+?(?<=\s)src="(.+?)"[^>]*>/gi)].map((match) => match[1]);
scrapedUrls.forEach((scrapedUrl) => {
let url;
if (isAbsoluteUrl(scrapedUrl)) {
+1 -1
View File
@@ -117,7 +117,7 @@ export function getPostContent(content) {
if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') {
// writeImageFile() saves all content images to a relative images/ folder,
// so rewrite the image references in the post content to point there
content = content.replace(/(<img\s[^>]*?src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
content = content.replace(/(<img(?=\s)[^>]+?(?<=\s)src=")[^"]*?([^/"]+)("[^>]*>)/gi, '$1images/$2$3');
}
// preserve "more" separator, max one per post, optionally with custom label