From f9e2bc5b0d2e4ceaee4dec56a0dbafa9d9e442f8 Mon Sep 17 00:00:00 2001 From: Will Boyd Date: Sun, 15 Dec 2019 13:44:04 -0500 Subject: [PATCH] Split out code for args, parsing, and shared --- index.js | 293 ++++---------------------------------------------- src/parser.js | 232 +++++++++++++++++++++++++++++++++++++++ src/shared.js | 5 + src/wizard.js | 48 +++++++++ src/writer.js | 0 5 files changed, 305 insertions(+), 273 deletions(-) create mode 100644 src/parser.js create mode 100644 src/shared.js create mode 100644 src/wizard.js create mode 100644 src/writer.js diff --git a/index.js b/index.js index 8f50b13..6cb75b3 100644 --- a/index.js +++ b/index.js @@ -1,275 +1,26 @@ const fs = require('fs'); const luxon = require('luxon'); -const minimist = require('minimist'); const path = require('path'); const request = require('request'); -const turndown = require('turndown'); -const xml2js = require('xml2js'); + +const shared = require('./src/shared'); +const wizard = require('./src/wizard'); +const parser = require('./src/parser'); // global so various functions can access arguments -let argv; +let config; -function init() { - argv = minimist(process.argv.slice(2), { - string: [ - 'input', - 'output' - ], - boolean: [ - 'yearmonthfolders', - 'yearfolders', - 'postfolders', - 'prefixdate', - 'saveimages', - 'addcontentimages' - ], - default: { - input: 'export.xml', - output: 'output', - yearmonthfolders: false, - yearfolders: false, - postfolders: true, - prefixdate: false, - saveimages: true, - addcontentimages: false - } - }); - - let content = readFile(argv.input); - parseFileContent(content); -} - -function readFile(path) { +async function init() { try { - return fs.readFileSync(path, 'utf8'); + config = wizard.getConfig(); + let posts = await parser.parseFilePromise(config) + writeFiles(posts); } catch (ex) { - console.log('Unable to read file.'); - console.log(ex.message); + // appease the UnhandledPromiseRejectionWarning + console.error(ex); } } -function parseFileContent(content) { - const processors = { tagNameProcessors: [ xml2js.processors.stripPrefix ] }; - xml2js.parseString(content, processors, (err, data) => { - if (err) { - console.log('Unable to parse file content.'); - console.log(err); - } else { - processData(data); - } - }); -} - -function processData(data) { - let images = collectImages(data); - let posts = collectPosts(data); - mergeImagesIntoPosts(images, posts); - writeFiles(posts); -} - -function collectImages(data) { - // start by collecting all attachment images - let images = getItemsOfType(data, 'attachment') - // filter to certain image file types - .filter(attachment => (/\.(gif|jpg|png)$/i).test(attachment.attachment_url[0])) - .map(attachment => ({ - id: attachment.post_id[0], - postId: attachment.post_parent[0], - url: attachment.attachment_url[0] - })); - - // optionally add images scraped from tags in post content - if (argv.addcontentimages) { - addContentImages(data, images); - } - - return images; -} - -function addContentImages(data, images) { - let regex = (/]*src="(.+?\.(?:gif|jpg|png))"[^>]*>/gi); - let match; - - getItemsOfType(data, 'post').forEach(post => { - let postId = post.post_id[0]; - let postContent = post.encoded[0]; - let postLink = post.link[0]; - - // reset lastIndex since we're reusing the same regex object - regex.lastIndex = 0; - while ((match = regex.exec(postContent)) !== null) { - // base the matched image URL relative to the post URL - let url = new URL(match[1], postLink).href; - - // add image if it hasn't already been added for this post - let exists = images.some(image => image.postId === postId && image.url === url); - if (!exists) { - images.push({ - id: -1, - postId: postId, - url: url - }); - console.log('Scraped ' + url + '.'); - } - } - }); -} - -function collectPosts(data) { - // this is passed into getPostContent() for the markdown conversion - turndownService = initTurndownService(); - - return getItemsOfType(data, 'post') - .map(post => ({ - // meta data isn't written to file, but is used to help with other things - meta: { - id: getPostId(post), - slug: getPostSlug(post), - coverImageId: getPostCoverImageId(post) - }, - frontmatter: { - title: getPostTitle(post), - date: getPostDate(post) - }, - content: getPostContent(post, turndownService) - })); -} - -function initTurndownService() { - let turndownService = new turndown({ - headingStyle: 'atx', - bulletListMarker: '-', - codeBlockStyle: 'fenced' - }); - - // preserve embedded tweets - turndownService.addRule('tweet', { - filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet', - replacement: (content, node) => '\n\n' + node.outerHTML - }); - - // preserve embedded codepens - turndownService.addRule('codepen', { - filter: node => { - // codepen embed snippets have changed over the years - // but this series of checks should find the commonalities - return ( - ['P', 'DIV'].includes(node.nodeName) && - node.attributes['data-slug-hash'] && - node.getAttribute('class') === 'codepen' - ); - }, - replacement: (content, node) => '\n\n' + node.outerHTML - }); - - // preserve embedded scripts (for tweets, codepens, gists, etc.) - turndownService.addRule('script', { - filter: 'script', - replacement: (content, node) => { - let before = '\n\n'; - let src = node.getAttribute('src'); - if (node.previousSibling && node.previousSibling.nodeName !== '#text') { - // keep twitter and codepen