const fs = require('fs');
const luxon = require('luxon');
const minimist = require('minimist');
const path = require('path');
const request = require('request');
const turndown = require('turndown');
const xml2js = require('xml2js');
// parsed command-line options, kept at module level so various functions can
// read them; assigned in init() from minimist's parse of process.argv
let argv;
// Entry point: parses command-line options into the module-level argv, reads
// the input file named by --input, and passes its content on for parsing.
//
// Options and their defaults:
//   --input             path of the export file to read ('export.xml')
//   --output            output location ('output')
//   --yearmonthfolders  boolean (false)
//   --yearfolders       boolean (false)
//   --postfolders       boolean (true)
//   --prefixdate        boolean (false)
//   --saveimages        boolean (true)
//   --addcontentimages  boolean (false)
function init() {
  argv = minimist(process.argv.slice(2), {
    string: [
      'input',
      'output'
    ],
    boolean: [
      'yearmonthfolders',
      'yearfolders',
      'postfolders',
      'prefixdate',
      'saveimages',
      'addcontentimages'
    ],
    default: {
      input: 'export.xml',
      output: 'output',
      yearmonthfolders: false,
      yearfolders: false,
      postfolders: true,
      prefixdate: false,
      saveimages: true,
      addcontentimages: false
    }
  });

  const content = readFile(argv.input);

  // readFile() logs the failure itself and returns undefined; stop here so
  // the parser isn't handed undefined and doesn't report a second, misleading
  // error on top of the real one
  if (content === undefined) {
    return;
  }

  parseFileContent(content);
}
// Reads the file at the given path as UTF-8 text and returns its content.
// If the read fails, the error message is logged and undefined is returned
// (nothing is thrown).
function readFile(path) {
  let text;
  try {
    text = fs.readFileSync(path, 'utf8');
  } catch (err) {
    console.log('Unable to read file.');
    console.log(err.message);
  }
  return text;
}
// Parses the XML text with xml2js, using the stripPrefix tag name processor,
// and forwards the parsed result to processData(). A parse error is logged
// and processing stops.
function parseFileContent(content) {
  const options = { tagNameProcessors: [ xml2js.processors.stripPrefix ] };

  const onParsed = (err, data) => {
    if (err) {
      console.log('Unable to parse file content.');
      console.log(err);
      return;
    }
    processData(data);
  };

  xml2js.parseString(content, options, onParsed);
}
// Runs the conversion steps on the parsed export data: collect images, then
// collect posts, merge the images into the posts, and write the posts out.
function processData(data) {
  const collectedImages = collectImages(data);
  const collectedPosts = collectPosts(data);

  mergeImagesIntoPosts(collectedImages, collectedPosts);
  writeFiles(collectedPosts);
}
// Builds the list of images found in the export data. Each entry has the
// shape { id, postId, url }, taken from an attachment item's post_id,
// post_parent and attachment_url. When argv.addcontentimages is set, the list
// is also passed to addContentImages() so images referenced in post content
// can be added to it.
function collectImages(data) {
  // start by collecting all attachment images
  const images = getItemsOfType(data, 'attachment')
    // filter to certain image file types
    .filter(attachment => (/\.(gif|jpg|png)$/i).test(attachment.attachment_url[0]))
    .map(attachment => ({
      id: attachment.post_id[0],
      postId: attachment.post_parent[0],
      url: attachment.attachment_url[0]
    }));

  // optionally add images scraped from <img> tags in post content
  if (argv.addcontentimages) {
    addContentImages(data, images);
  }

  return images;
}
// Scans the content of every post for <img> tags whose src ends in .gif, .jpg
// or .png and appends each newly found image to the images array (mutated in
// place). Scraped entries get an id of -1. An image is skipped when the same
// URL is already listed for the same post. Each addition is logged.
function addContentImages(data, images) {
  // captures the src attribute value of an <img> tag
  const regex = /<img[^>]*src="(.+?\.(?:gif|jpg|png))"[^>]*>/gi;

  getItemsOfType(data, 'post').forEach(post => {
    const postId = post.post_id[0];
    const postContent = post.encoded[0];
    const postLink = post.link[0];

    // reset lastIndex since we're reusing the same regex object
    regex.lastIndex = 0;

    let match;
    while ((match = regex.exec(postContent)) !== null) {
      // base the matched image URL relative to the post URL
      const url = new URL(match[1], postLink).href;

      // add image if it hasn't already been added for this post
      const exists = images.some(image => image.postId === postId && image.url === url);
      if (!exists) {
        images.push({
          id: -1,
          postId: postId,
          url: url
        });
        console.log('Scraped ' + url + '.');
      }
    }
  });
}
// Builds one object per item of type 'post' in the export data, shaped as
// { meta, frontmatter, content }. The field values come from the getPost*()
// helpers; content is produced by getPostContent() using a turndown service
// created once for the whole run.
function collectPosts(data) {
  // this is passed into getPostContent() for the markdown conversion
  // (declared locally so it is not created as an implicit global)
  const turndownService = initTurndownService();

  return getItemsOfType(data, 'post')
    .map(post => ({
      // meta data isn't written to file, but is used to help with other things
      meta: {
        id: getPostId(post),
        slug: getPostSlug(post),
        coverImageId: getPostCoverImageId(post)
      },
      frontmatter: {
        title: getPostTitle(post),
        date: getPostDate(post)
      },
      content: getPostContent(post, turndownService)
    }));
}
function initTurndownService() {
let turndownService = new turndown({
headingStyle: 'atx',
bulletListMarker: '-',
codeBlockStyle: 'fenced'
});
// preserve embedded tweets
turndownService.addRule('tweet', {
filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet',
replacement: (content, node) => '\n\n' + node.outerHTML
});
// preserve embedded codepens
turndownService.addRule('codepen', {
filter: node => {
// codepen embed snippets have changed over the years
// but this series of checks should find the commonalities
return (
['P', 'DIV'].includes(node.nodeName) &&
node.attributes['data-slug-hash'] &&
node.getAttribute('class') === 'codepen'
);
},
replacement: (content, node) => '\n\n' + node.outerHTML
});
// preserve embedded scripts (for tweets, codepens, gists, etc.)
turndownService.addRule('script', {
filter: 'script',
replacement: (content, node) => {
let before = '\n\n';
let src = node.getAttribute('src');
if (node.previousSibling && node.previousSibling.nodeName !== '#text') {
// keep twitter and codepen