mirror of
https://github.com/10h30/wordpress-export-to-markdown.git
synced 2026-06-05 15:09:59 +09:00
Refactor image scraping/saving
This commit is contained in:
+31
-32
@@ -13,59 +13,56 @@ async function parseFilePromise(config) {
|
||||
tagNameProcessors: [xml2js.processors.stripPrefix]
|
||||
});
|
||||
|
||||
let images = collectImages(data, config);
|
||||
let images = [];
|
||||
if (config.saveattachedimages) {
|
||||
images.push(...collectAttachedImages(data));
|
||||
}
|
||||
if (config.savescrapedimages) {
|
||||
images.push(...collectScrapedImages(data));
|
||||
}
|
||||
|
||||
let posts = collectPosts(data);
|
||||
mergeImagesIntoPosts(images, posts);
|
||||
|
||||
return Promise.resolve(posts);
|
||||
}
|
||||
|
||||
function collectImages(data, config) {
|
||||
// start by collecting all attachment images
|
||||
function collectAttachedImages(data) {
|
||||
let images = getItemsOfType(data, 'attachment')
|
||||
// filter to certain image file types
|
||||
.filter(attachment => (/\.(gif|jpg|png)$/i).test(attachment.attachment_url[0]))
|
||||
.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
|
||||
.map(attachment => ({
|
||||
id: attachment.post_id[0],
|
||||
postId: attachment.post_parent[0],
|
||||
url: attachment.attachment_url[0]
|
||||
}));
|
||||
|
||||
// optionally add images scraped from <img> tags in post content
|
||||
if (config.addcontentimages) {
|
||||
addContentImages(data, images);
|
||||
}
|
||||
|
||||
return images;
|
||||
}
|
||||
|
||||
function addContentImages(data, images) {
|
||||
let regex = (/<img[^>]*src="(.+?\.(?:gif|jpg|png))"[^>]*>/gi);
|
||||
let match;
|
||||
function collectScrapedImages(data) {
|
||||
let images = [];
|
||||
|
||||
getItemsOfType(data, 'post').forEach(post => {
|
||||
let postId = post.post_id[0];
|
||||
let postContent = post.encoded[0];
|
||||
let postLink = post.link[0];
|
||||
|
||||
// reset lastIndex since we're reusing the same regex object
|
||||
regex.lastIndex = 0;
|
||||
while ((match = regex.exec(postContent)) !== null) {
|
||||
let matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
|
||||
matches.forEach(match => {
|
||||
// base the matched image URL relative to the post URL
|
||||
let url = new URL(match[1], postLink).href;
|
||||
|
||||
// add image if it hasn't already been added for this post
|
||||
let exists = images.some(image => image.postId === postId && image.url === url);
|
||||
if (!exists) {
|
||||
images.push({
|
||||
id: -1,
|
||||
postId: postId,
|
||||
url: url
|
||||
});
|
||||
console.log('Scraped ' + url + '.');
|
||||
}
|
||||
}
|
||||
});
|
||||
images.push({
|
||||
id: -1,
|
||||
postId: postId,
|
||||
url: url
|
||||
});
|
||||
console.log('Scraped ' + url + '.');
|
||||
});
|
||||
});
|
||||
|
||||
return images;
|
||||
}
|
||||
|
||||
function collectPosts(data) {
|
||||
@@ -78,7 +75,8 @@ function collectPosts(data) {
|
||||
meta: {
|
||||
id: getPostId(post),
|
||||
slug: getPostSlug(post),
|
||||
coverImageId: getPostCoverImageId(post)
|
||||
coverImageId: getPostCoverImageId(post),
|
||||
imageUrls: []
|
||||
},
|
||||
frontmatter: {
|
||||
title: getPostTitle(post),
|
||||
@@ -125,14 +123,15 @@ function mergeImagesIntoPosts(images, posts) {
|
||||
images.forEach(image => {
|
||||
let post = postsLookup[image.postId];
|
||||
if (post) {
|
||||
// save full image URLs for downloading later
|
||||
post.meta.imageUrls = post.meta.imageUrls || [];
|
||||
post.meta.imageUrls.push(image.url);
|
||||
|
||||
if (image.id === post.meta.coverImageId) {
|
||||
// save cover image filename to frontmatter
|
||||
post.frontmatter.coverImage = shared.getFilenameFromUrl(image.url);
|
||||
}
|
||||
|
||||
// save (unique) full image URLs for downloading later
|
||||
if (!post.meta.imageUrls.includes(image.url)) {
|
||||
post.meta.imageUrls.push(image.url);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
+5
-4
@@ -1,5 +1,7 @@
|
||||
const turndown = require('turndown');
|
||||
|
||||
const shared = require('./shared');
|
||||
|
||||
function initTurndownService() {
|
||||
let turndownService = new turndown({
|
||||
headingStyle: 'atx',
|
||||
@@ -32,7 +34,6 @@ function initTurndownService() {
|
||||
filter: 'script',
|
||||
replacement: (content, node) => {
|
||||
let before = '\n\n';
|
||||
let src = node.getAttribute('src');
|
||||
if (node.previousSibling && node.previousSibling.nodeName !== '#text') {
|
||||
// keep twitter and codepen <script> tags snug with the element above them
|
||||
before = '\n';
|
||||
@@ -63,10 +64,10 @@ function getPostContent(post, turndownService, config) {
|
||||
// without mucking up content inside of other elements (like <code> blocks)
|
||||
content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');
|
||||
|
||||
if (config.addcontentimages) {
|
||||
if (config.savescrapedimages) {
|
||||
// writeImageFile() will save all content images to a relative /images
|
||||
// folder so update references in post content to match
|
||||
content = content.replace(/(<img[^>]*src=").*?([^\/"]+\.(?:gif|jpg|png))("[^>]*>)/gi, '$1images/$2$3');
|
||||
// folder so update references in post content to match
|
||||
content = content.replace(/(<img[^>]*src=").*?([^\/"]+\.(?:gif|jpe?g|png))("[^>]*>)/gi, '$1images/$2$3');
|
||||
}
|
||||
|
||||
// this is a hack to make <iframe> nodes non-empty by inserting a "." which
|
||||
|
||||
+4
-4
@@ -13,8 +13,8 @@ function getConfig() {
|
||||
'yearfolders',
|
||||
'postfolders',
|
||||
'prefixdate',
|
||||
'saveimages',
|
||||
'addcontentimages'
|
||||
'saveattachedimages',
|
||||
'savescrapedimages'
|
||||
],
|
||||
default: {
|
||||
input: 'export.xml',
|
||||
@@ -23,8 +23,8 @@ function getConfig() {
|
||||
yearfolders: false,
|
||||
postfolders: true,
|
||||
prefixdate: false,
|
||||
saveimages: true,
|
||||
addcontentimages: false
|
||||
saveattachedimages: true,
|
||||
savescrapedimages: true
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
+6
-8
@@ -12,14 +12,12 @@ function writeFiles(posts, config) {
|
||||
createDir(postDir);
|
||||
writeMarkdownFile(post, postDir, config);
|
||||
|
||||
if (config.saveimages && post.meta.imageUrls) {
|
||||
post.meta.imageUrls.forEach(imageUrl => {
|
||||
const imageDir = path.join(postDir, 'images');
|
||||
createDir(imageDir);
|
||||
writeImageFile(imageUrl, imageDir, delay);
|
||||
delay += 25;
|
||||
});
|
||||
}
|
||||
post.meta.imageUrls.forEach(imageUrl => {
|
||||
const imageDir = path.join(postDir, 'images');
|
||||
createDir(imageDir);
|
||||
writeImageFile(imageUrl, imageDir, delay);
|
||||
delay += 25;
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user