Refactor image scraping/saving

2026-06-05 15:09:59 +09:00 · 2019-12-19 12:35:33 -05:00
parent cb0eb31fcf
commit dad90796e2
4 changed files with 46 additions and 48 deletions
@@ -13,59 +13,56 @@ async function parseFilePromise(config) {
 		tagNameProcessors: [xml2js.processors.stripPrefix]
 	});

-	let images = collectImages(data, config);
+	let images = [];
+	if (config.saveattachedimages) {
+		images.push(...collectAttachedImages(data));
+	}
+	if (config.savescrapedimages) {
+		images.push(...collectScrapedImages(data));
+	}
+
 	let posts = collectPosts(data);
 	mergeImagesIntoPosts(images, posts);

 	return Promise.resolve(posts);
 }

-function collectImages(data, config) {
-	// start by collecting all attachment images
+function collectAttachedImages(data) {
 	let images = getItemsOfType(data, 'attachment')
 		// filter to certain image file types
-		.filter(attachment => (/\.(gif|jpg|png)$/i).test(attachment.attachment_url[0]))
+		.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
 		.map(attachment => ({
 			id: attachment.post_id[0],
 			postId: attachment.post_parent[0],
 			url: attachment.attachment_url[0]
 		}));

-	// optionally add images scraped from <img> tags in post content
-	if (config.addcontentimages) {
-		addContentImages(data, images);
-	}
-
 	return images;
 }

-function addContentImages(data, images) {
-	let regex = (/<img[^>]*src="(.+?\.(?:gif|jpg|png))"[^>]*>/gi);
-	let match;
+function collectScrapedImages(data) {
+	let images = [];

 	getItemsOfType(data, 'post').forEach(post => {
 		let postId = post.post_id[0];
 		let postContent = post.encoded[0];
 		let postLink = post.link[0];

-		// reset lastIndex since we're reusing the same regex object
-		regex.lastIndex = 0;
-		while ((match = regex.exec(postContent)) !== null) {
+		let matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
+		matches.forEach(match => {
 			// base the matched image URL relative to the post URL
 			let url = new URL(match[1], postLink).href;

-			// add image if it hasn't already been added for this post
-			let exists = images.some(image => image.postId === postId && image.url === url);
-			if (!exists) {
-				images.push({
-					id: -1,
-					postId: postId,
-					url: url
-				});
-				console.log('Scraped ' + url + '.');
-			}
-		}
-	});	
+			images.push({
+				id: -1,
+				postId: postId,
+				url: url
+			});
+			console.log('Scraped ' + url + '.');
+		});
+	});
+
+	return images;
 }

 function collectPosts(data) {
@@ -78,7 +75,8 @@ function collectPosts(data) {
 			meta: {
 				id: getPostId(post),
 				slug: getPostSlug(post),
-				coverImageId: getPostCoverImageId(post)
+				coverImageId: getPostCoverImageId(post),
+				imageUrls: []
 			},
 			frontmatter: {
 				title: getPostTitle(post),
@@ -125,14 +123,15 @@ function mergeImagesIntoPosts(images, posts) {
 	images.forEach(image => {
 		let post = postsLookup[image.postId];
 		if (post) {
-			// save full image URLs for downloading later
-			post.meta.imageUrls = post.meta.imageUrls || [];
-			post.meta.imageUrls.push(image.url);
-
 			if (image.id === post.meta.coverImageId) {
 				// save cover image filename to frontmatter
 				post.frontmatter.coverImage = shared.getFilenameFromUrl(image.url);
 			}
+			
+			// save (unique) full image URLs for downloading later
+			if (!post.meta.imageUrls.includes(image.url)) {
+				post.meta.imageUrls.push(image.url);
+			}
 		}
 	});
 }
@@ -1,5 +1,7 @@
 const turndown = require('turndown');

+const shared = require('./shared');
+
 function initTurndownService() {
 	let turndownService = new turndown({
 		headingStyle: 'atx',
@@ -32,7 +34,6 @@ function initTurndownService() {
 		filter: 'script',
 		replacement: (content, node) => {
 			let before = '\n\n';
-			let src = node.getAttribute('src');
 			if (node.previousSibling && node.previousSibling.nodeName !== '#text') {
 				// keep twitter and codepen <script> tags snug with the element above them
 				before = '\n';
@@ -63,10 +64,10 @@ function getPostContent(post, turndownService, config) {
 	// without mucking up content inside of other elements (like <code> blocks)
 	content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');

-	if (config.addcontentimages) {
+	if (config.savescrapedimages) {
 		// writeImageFile() will save all content images to a relative /images
-		// folder so update references in post content to match
-		content = content.replace(/(<img[^>]*src=").*?([^\/"]+\.(?:gif|jpg|png))("[^>]*>)/gi, '$1images/$2$3');
+        // folder so update references in post content to match
+		content = content.replace(/(<img[^>]*src=").*?([^\/"]+\.(?:gif|jpe?g|png))("[^>]*>)/gi, '$1images/$2$3');
 	}

 	// this is a hack to make <iframe> nodes non-empty by inserting a "." which
@@ -13,8 +13,8 @@ function getConfig() {
 			'yearfolders',
 			'postfolders',
 			'prefixdate',
-			'saveimages',
-			'addcontentimages'
+			'saveattachedimages',
+			'savescrapedimages'
 		],
 		default: {
 			input: 'export.xml',
@@ -23,8 +23,8 @@ function getConfig() {
 			yearfolders: false,
 			postfolders: true,
 			prefixdate: false,
-			saveimages: true,
-			addcontentimages: false
+			saveattachedimages: true,
+			savescrapedimages: true
 		}
 	});

@@ -12,14 +12,12 @@ function writeFiles(posts, config) {
 		createDir(postDir);
 		writeMarkdownFile(post, postDir, config);

-		if (config.saveimages && post.meta.imageUrls) {
-			post.meta.imageUrls.forEach(imageUrl => {
-				const imageDir = path.join(postDir, 'images');
-				createDir(imageDir);
-				writeImageFile(imageUrl, imageDir, delay);
-				delay += 25;
-			});
-		}
+        post.meta.imageUrls.forEach(imageUrl => {
+            const imageDir = path.join(postDir, 'images');
+            createDir(imageDir);
+            writeImageFile(imageUrl, imageDir, delay);
+            delay += 25;
+        });
 	});
 }