2019-12-15 13:44:04 -05:00
|
|
|
const fs = require('fs');
|
|
|
|
|
const luxon = require('luxon');
|
|
|
|
|
const xml2js = require('xml2js');
|
|
|
|
|
|
|
|
|
|
const shared = require('./shared');
|
2019-12-17 13:52:09 -05:00
|
|
|
const translator = require('./translator');
|
2019-12-15 13:44:04 -05:00
|
|
|
|
2019-12-17 14:03:58 -05:00
|
|
|
/**
 * Reads the export file named by `config.input`, parses it as XML and builds
 * the list of post objects, with any collected image URLs merged into them.
 *
 * @param {object} config - Run settings. This function reads `config.input`
 *   (path of the file to read), `config.saveAttachedImages` and
 *   `config.saveScrapedImages` (which image sources to collect); the whole
 *   object is also handed on to collectPosts().
 * @returns {Promise<object[]>} Resolves with the posts built by collectPosts().
 *   Rejects when the file cannot be read or the XML cannot be parsed.
 */
async function parseFilePromise(config) {
  console.log('\nParsing export file...');
  const content = await fs.promises.readFile(config.input, 'utf8');
  const data = await xml2js.parseStringPromise(content, {
    trim: true,
    // strip XML namespace prefixes from tag names
    tagNameProcessors: [xml2js.processors.stripPrefix]
  });

  const posts = collectPosts(data, config);

  // concat() instead of push(...spread): spreading a very large array into
  // call arguments can exceed the engine's argument limit and throw
  let images = [];
  if (config.saveAttachedImages) {
    images = images.concat(collectAttachedImages(data));
  }
  if (config.saveScrapedImages) {
    images = images.concat(collectScrapedImages(data));
  }

  // mutates the post objects in place
  mergeImagesIntoPosts(images, posts);

  return posts;
}
|
|
|
|
|
|
2019-12-19 13:17:43 -05:00
|
|
|
/**
 * Returns the channel items whose first `post_type` entry equals `type`.
 *
 * @param {object} data - Parsed export; items are read from
 *   `data.rss.channel[0].item`.
 * @param {string} type - Post type to select (this file uses 'post' and
 *   'attachment').
 * @returns {object[]} Matching items in their original order; an empty array
 *   when the channel has no items at all.
 */
function getItemsOfType(data, type) {
  // a channel without any <item> children has no `item` key
  const items = data.rss.channel[0].item || [];

  // items lacking a post_type element simply never match
  return items.filter(item => Array.isArray(item.post_type) && item.post_type[0] === type);
}
|
|
|
|
|
|
2020-01-12 09:03:32 -05:00
|
|
|
/**
 * Builds one post object for every item of type 'post' in the parsed export.
 *
 * @param {object} data - Parsed export, as accepted by getItemsOfType().
 * @param {object} config - Run settings; passed through untouched to
 *   translator.getPostContent().
 * @returns {object[]} One `{ meta, frontmatter, content }` object per post.
 *   `meta.imageUrls` starts out empty.
 */
function collectPosts(data, config) {
  // this is passed into getPostContent() for the markdown conversion
  // (declared locally: assigning to an undeclared name would create a global,
  // or throw a ReferenceError in strict mode)
  const turndownService = translator.initTurndownService();

  const posts = getItemsOfType(data, 'post').map(post => ({
    // meta data isn't written to file, but is used to help with other things
    meta: {
      id: getPostId(post),
      slug: getPostSlug(post),
      coverImageId: getPostCoverImageId(post),
      imageUrls: []
    },
    frontmatter: {
      title: getPostTitle(post),
      date: getPostDate(post)
    },
    content: translator.getPostContent(post, turndownService, config)
  }));

  console.log(`${posts.length} posts found.`);
  return posts;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Returns the post's id.
 *
 * @param {object} post - Parsed item; must carry a `post_id` array.
 * @returns {*} The first entry of `post.post_id`.
 */
function getPostId(post) {
  return post.post_id[0];
}
|
|
|
|
|
|
2020-01-12 13:50:50 -05:00
|
|
|
/**
 * Returns the post's slug.
 *
 * @param {object} post - Parsed item; must carry a `post_name` array.
 * @returns {*} The first entry of `post.post_name`.
 */
function getPostSlug(post) {
  return post.post_name[0];
}
|
|
|
|
|
|
2019-12-15 13:44:04 -05:00
|
|
|
/**
 * Looks up the id of the post's cover image in its post meta entries.
 *
 * @param {object} post - Parsed item; `post.postmeta` may be absent.
 * @returns {*} The first `meta_value` of the entry whose first `meta_key` is
 *   '_thumbnail_id', or `undefined` when the post has no postmeta or no such
 *   entry.
 */
function getPostCoverImageId(post) {
  if (post.postmeta === undefined) {
    return undefined;
  }

  const thumbnailMeta = post.postmeta.find(meta => meta.meta_key[0] === '_thumbnail_id');
  return thumbnailMeta ? thumbnailMeta.meta_value[0] : undefined;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Returns the post's title.
 *
 * @param {object} post - Parsed item; must carry a `title` array.
 * @returns {*} The first entry of `post.title`.
 */
function getPostTitle(post) {
  return post.title[0];
}
|
|
|
|
|
|
|
|
|
|
/**
 * Converts the post's publication date into an ISO date string.
 *
 * @param {object} post - Parsed item; `post.pubDate[0]` is parsed as an
 *   RFC 2822 date string.
 * @returns {*} Result of luxon's `toISODate()` for that date, evaluated in
 *   the UTC zone.
 */
function getPostDate(post) {
  const published = luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: 'utc' });
  return published.toISODate();
}
|
|
|
|
|
|
2019-12-19 13:17:43 -05:00
|
|
|
/**
 * Collects image descriptors from the export's 'attachment' items.
 *
 * @param {object} data - Parsed export, as accepted by getItemsOfType().
 * @returns {object[]} One `{ id, postId, url }` object per attachment whose
 *   URL ends in .gif, .jpg, .jpeg or .png (case-insensitive). `id` is the
 *   attachment's own post_id and `postId` its post_parent.
 */
function collectAttachedImages(data) {
  const images = getItemsOfType(data, 'attachment')
    // filter to certain image file types
    .filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
    .map(attachment => ({
      id: attachment.post_id[0],
      postId: attachment.post_parent[0],
      url: attachment.attachment_url[0]
    }));

  console.log(`${images.length} attached images found.`);
  return images;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Scans the body content of every 'post' item for <img> tags and collects
 * the images they point to.
 *
 * @param {object} data - Parsed export, as accepted by getItemsOfType().
 * @returns {object[]} One `{ id, postId, url }` object per matched <img> tag
 *   whose double-quoted src ends in .gif, .jpg, .jpeg or .png
 *   (case-insensitive). `id` is always -1 and `url` is absolute, resolved
 *   against the post's link.
 */
function collectScrapedImages(data) {
  // [^"]+? keeps the capture inside the src attribute's own quotes; a bare
  // .+? could run on into a later attribute that happens to end in an image
  // extension (e.g. src="a.bmp" alt="x.png")
  const imgSrcPattern = /<img[^>]*src="([^"]+?\.(?:gif|jpe?g|png))"[^>]*>/gi;
  const images = [];

  getItemsOfType(data, 'post').forEach(post => {
    const postId = post.post_id[0];
    const postContent = post.encoded[0];
    const postLink = post.link[0];

    for (const match of postContent.matchAll(imgSrcPattern)) {
      // base the matched image URL relative to the post URL
      const url = new URL(match[1], postLink).href;
      images.push({ id: -1, postId, url });
    }
  });

  console.log(`${images.length} images scraped from post body content.`);
  return images;
}
|
|
|
|
|
|
2019-12-15 13:44:04 -05:00
|
|
|
/**
 * Attaches collected images to the posts they belong to, mutating the posts
 * in place.
 *
 * For each image whose `postId` matches a post's `meta.id`:
 *  - if the image's id equals the post's `meta.coverImageId`, the image's
 *    filename is stored in `frontmatter.coverImage`;
 *  - the image's URL is appended to `meta.imageUrls` unless already present.
 * Images that match no post are ignored.
 *
 * @param {object[]} images - `{ id, postId, url }` objects.
 * @param {object[]} posts - Post objects with `meta` and `frontmatter`.
 * @returns {undefined}
 */
function mergeImagesIntoPosts(images, posts) {
  // create lookup table for quicker traversal; a Map keyed by string (the
  // same coercion a plain object's property keys get) cannot be fooled by
  // ids that collide with Object.prototype members such as "constructor"
  const postsLookup = new Map(posts.map(post => [String(post.meta.id), post]));

  images.forEach(image => {
    const post = postsLookup.get(String(image.postId));
    if (!post) {
      return;
    }

    if (image.id === post.meta.coverImageId) {
      // save cover image filename to frontmatter
      post.frontmatter.coverImage = shared.getFilenameFromUrl(image.url);
    }

    // save (unique) full image URLs for downloading later
    if (!post.meta.imageUrls.includes(image.url)) {
      post.meta.imageUrls.push(image.url);
    }
  });
}
|
|
|
|
|
|
2019-12-17 13:52:09 -05:00
|
|
|
// expose parseFilePromise() to other modules
exports.parseFilePromise = parseFilePromise;
|