const fs = require('fs'); const luxon = require('luxon'); const minimist = require('minimist'); const path = require('path'); const request = require('request'); const turndown = require('turndown'); const xml2js = require('xml2js'); // global so various functions can access arguments let argv; function init() { argv = minimist(process.argv.slice(2), { string: [ 'input', 'output' ], boolean: [ 'yearmonthfolders', 'yearfolders', 'postfolders', 'prefixdate', 'saveimages', 'addcontentimages' ], default: { input: 'export.xml', output: 'output', yearmonthfolders: false, yearfolders: false, postfolders: true, prefixdate: false, saveimages: true, addcontentimages: false } }); let content = readFile(argv.input); parseFileContent(content); } function readFile(path) { try { return fs.readFileSync(path, 'utf8'); } catch (ex) { console.log('Unable to read file.'); console.log(ex.message); } } function parseFileContent(content) { const processors = { tagNameProcessors: [ xml2js.processors.stripPrefix ] }; xml2js.parseString(content, processors, (err, data) => { if (err) { console.log('Unable to parse file content.'); console.log(err); } else { processData(data); } }); } function processData(data) { let images = collectImages(data); let posts = collectPosts(data); mergeImagesIntoPosts(images, posts); writeFiles(posts); } function collectImages(data) { // start by collecting all attachment images let images = getItemsOfType(data, 'attachment') // filter to certain image file types .filter(attachment => (/\.(gif|jpg|png)$/i).test(attachment.attachment_url[0])) .map(attachment => ({ id: attachment.post_id[0], postId: attachment.post_parent[0], url: attachment.attachment_url[0] })); // optionally add images scraped from tags in post content if (argv.addcontentimages) { addContentImages(data, images); } return images; } function addContentImages(data, images) { let regex = (/]*src="(.+?\.(?:gif|jpg|png))"[^>]*>/gi); let match; getItemsOfType(data, 'post').forEach(post => { let postId = post.post_id[0]; let postContent = post.encoded[0]; let postLink = post.link[0]; // reset lastIndex since we're reusing the same regex object regex.lastIndex = 0; while ((match = regex.exec(postContent)) !== null) { // base the matched image URL relative to the post URL let url = new URL(match[1], postLink).href; // add image if it hasn't already been added for this post let exists = images.some(image => image.postId === postId && image.url === url); if (!exists) { images.push({ id: -1, postId: postId, url: url }); console.log('Scraped ' + url + '.'); } } }); } function collectPosts(data) { // this is passed into getPostContent() for the markdown conversion turndownService = initTurndownService(); return getItemsOfType(data, 'post') .map(post => ({ // meta data isn't written to file, but is used to help with other things meta: { id: getPostId(post), slug: getPostSlug(post), coverImageId: getPostCoverImageId(post) }, frontmatter: { title: getPostTitle(post), date: getPostDate(post) }, content: getPostContent(post, turndownService) })); } function initTurndownService() { let turndownService = new turndown({ headingStyle: 'atx', bulletListMarker: '-', codeBlockStyle: 'fenced' }); // preserve embedded tweets turndownService.addRule('tweet', { filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet', replacement: (content, node) => '\n\n' + node.outerHTML }); // preserve embedded codepens turndownService.addRule('codepen', { filter: node => { // codepen embed snippets have changed over the years // but this series of checks should find the commonalities return ( ['P', 'DIV'].includes(node.nodeName) && node.attributes['data-slug-hash'] && node.getAttribute('class') === 'codepen' ); }, replacement: (content, node) => '\n\n' + node.outerHTML }); // preserve embedded scripts (for tweets, codepens, gists, etc.) turndownService.addRule('script', { filter: 'script', replacement: (content, node) => { let before = '\n\n'; let src = node.getAttribute('src'); if (node.previousSibling && node.previousSibling.nodeName !== '#text') { // keep twitter and codepen