Refactor image scraping/saving

This commit is contained in:
Will Boyd
2019-12-19 12:35:33 -05:00
parent cb0eb31fcf
commit dad90796e2
4 changed files with 46 additions and 48 deletions
+31 -32
View File
@@ -13,59 +13,56 @@ async function parseFilePromise(config) {
tagNameProcessors: [xml2js.processors.stripPrefix]
});
let images = collectImages(data, config);
let images = [];
if (config.saveattachedimages) {
images.push(...collectAttachedImages(data));
}
if (config.savescrapedimages) {
images.push(...collectScrapedImages(data));
}
let posts = collectPosts(data);
mergeImagesIntoPosts(images, posts);
return Promise.resolve(posts);
}
function collectImages(data, config) {
// start by collecting all attachment images
function collectAttachedImages(data) {
let images = getItemsOfType(data, 'attachment')
// filter to certain image file types
.filter(attachment => (/\.(gif|jpg|png)$/i).test(attachment.attachment_url[0]))
.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
.map(attachment => ({
id: attachment.post_id[0],
postId: attachment.post_parent[0],
url: attachment.attachment_url[0]
}));
// optionally add images scraped from <img> tags in post content
if (config.addcontentimages) {
addContentImages(data, images);
}
return images;
}
function addContentImages(data, images) {
let regex = (/<img[^>]*src="(.+?\.(?:gif|jpg|png))"[^>]*>/gi);
let match;
function collectScrapedImages(data) {
let images = [];
getItemsOfType(data, 'post').forEach(post => {
let postId = post.post_id[0];
let postContent = post.encoded[0];
let postLink = post.link[0];
// reset lastIndex since we're reusing the same regex object
regex.lastIndex = 0;
while ((match = regex.exec(postContent)) !== null) {
let matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
matches.forEach(match => {
// base the matched image URL relative to the post URL
let url = new URL(match[1], postLink).href;
// add image if it hasn't already been added for this post
let exists = images.some(image => image.postId === postId && image.url === url);
if (!exists) {
images.push({
id: -1,
postId: postId,
url: url
});
console.log('Scraped ' + url + '.');
}
}
});
images.push({
id: -1,
postId: postId,
url: url
});
console.log('Scraped ' + url + '.');
});
});
return images;
}
function collectPosts(data) {
@@ -78,7 +75,8 @@ function collectPosts(data) {
meta: {
id: getPostId(post),
slug: getPostSlug(post),
coverImageId: getPostCoverImageId(post)
coverImageId: getPostCoverImageId(post),
imageUrls: []
},
frontmatter: {
title: getPostTitle(post),
@@ -125,14 +123,15 @@ function mergeImagesIntoPosts(images, posts) {
images.forEach(image => {
let post = postsLookup[image.postId];
if (post) {
// save full image URLs for downloading later
post.meta.imageUrls = post.meta.imageUrls || [];
post.meta.imageUrls.push(image.url);
if (image.id === post.meta.coverImageId) {
// save cover image filename to frontmatter
post.frontmatter.coverImage = shared.getFilenameFromUrl(image.url);
}
// save (unique) full image URLs for downloading later
if (!post.meta.imageUrls.includes(image.url)) {
post.meta.imageUrls.push(image.url);
}
}
});
}
+5 -4
View File
@@ -1,5 +1,7 @@
const turndown = require('turndown');
const shared = require('./shared');
function initTurndownService() {
let turndownService = new turndown({
headingStyle: 'atx',
@@ -32,7 +34,6 @@ function initTurndownService() {
filter: 'script',
replacement: (content, node) => {
let before = '\n\n';
let src = node.getAttribute('src');
if (node.previousSibling && node.previousSibling.nodeName !== '#text') {
// keep twitter and codepen <script> tags snug with the element above them
before = '\n';
@@ -63,10 +64,10 @@ function getPostContent(post, turndownService, config) {
// without mucking up content inside of other elements (like <code> blocks)
content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');
if (config.addcontentimages) {
if (config.savescrapedimages) {
// writeImageFile() will save all content images to a relative /images
// folder so update references in post content to match
content = content.replace(/(<img[^>]*src=").*?([^\/"]+\.(?:gif|jpg|png))("[^>]*>)/gi, '$1images/$2$3');
// folder so update references in post content to match
content = content.replace(/(<img[^>]*src=").*?([^\/"]+\.(?:gif|jpe?g|png))("[^>]*>)/gi, '$1images/$2$3');
}
// this is a hack to make <iframe> nodes non-empty by inserting a "." which
+4 -4
View File
@@ -13,8 +13,8 @@ function getConfig() {
'yearfolders',
'postfolders',
'prefixdate',
'saveimages',
'addcontentimages'
'saveattachedimages',
'savescrapedimages'
],
default: {
input: 'export.xml',
@@ -23,8 +23,8 @@ function getConfig() {
yearfolders: false,
postfolders: true,
prefixdate: false,
saveimages: true,
addcontentimages: false
saveattachedimages: true,
savescrapedimages: true
}
});
+6 -8
View File
@@ -12,14 +12,12 @@ function writeFiles(posts, config) {
createDir(postDir);
writeMarkdownFile(post, postDir, config);
if (config.saveimages && post.meta.imageUrls) {
post.meta.imageUrls.forEach(imageUrl => {
const imageDir = path.join(postDir, 'images');
createDir(imageDir);
writeImageFile(imageUrl, imageDir, delay);
delay += 25;
});
}
post.meta.imageUrls.forEach(imageUrl => {
const imageDir = path.join(postDir, 'images');
createDir(imageDir);
writeImageFile(imageUrl, imageDir, delay);
delay += 25;
});
});
}