Split out code for args, parsing, and shared

This commit is contained in:
Will Boyd
2019-12-15 13:44:04 -05:00
parent 2024d63aed
commit f9e2bc5b0d
5 changed files with 305 additions and 273 deletions
+20 -273
View File
@@ -1,275 +1,26 @@
const fs = require('fs');
const luxon = require('luxon');
const minimist = require('minimist');
const path = require('path');
const request = require('request');
const turndown = require('turndown');
const xml2js = require('xml2js');
const shared = require('./src/shared');
const wizard = require('./src/wizard');
const parser = require('./src/parser');
// global so various functions can access arguments
let argv;
let config;
// entry point: parses the command line into the global argv, then reads the
// input file and passes its content on for parsing
function init() {
  const options = {
    string: ['input', 'output'],
    boolean: [
      'yearmonthfolders',
      'yearfolders',
      'postfolders',
      'prefixdate',
      'saveimages',
      'addcontentimages'
    ],
    default: {
      input: 'export.xml',
      output: 'output',
      yearmonthfolders: false,
      yearfolders: false,
      postfolders: true,
      prefixdate: false,
      saveimages: true,
      addcontentimages: false
    }
  };

  // the first two entries of process.argv are dropped before parsing
  argv = minimist(process.argv.slice(2), options);

  const fileContent = readFile(argv.input);
  parseFileContent(fileContent);
}
function readFile(path) {
async function init() {
try {
return fs.readFileSync(path, 'utf8');
config = wizard.getConfig();
let posts = await parser.parseFilePromise(config)
writeFiles(posts);
} catch (ex) {
console.log('Unable to read file.');
console.log(ex.message);
// appease the UnhandledPromiseRejectionWarning
console.error(ex);
}
}
// parses the XML export content and forwards the parsed data to processData();
// parse errors are logged rather than thrown
function parseFileContent(content) {
  // apply xml2js's stripPrefix processor to every tag name
  const options = { tagNameProcessors: [xml2js.processors.stripPrefix] };

  const onParsed = (err, data) => {
    if (!err) {
      processData(data);
      return;
    }
    console.log('Unable to parse file content.');
    console.log(err);
  };

  xml2js.parseString(content, options, onParsed);
}
// turns the parsed export data into posts (with their images merged in)
// and writes them out
function processData(data) {
  const allImages = collectImages(data);
  const allPosts = collectPosts(data);

  // attach image URLs (and cover image filenames) to the posts they belong to
  mergeImagesIntoPosts(allImages, allPosts);

  writeFiles(allPosts);
}
// builds the list of images ({ id, postId, url }) found in the export data
function collectImages(data) {
  // only attachments with these file extensions count as images
  const imageExtension = /\.(gif|jpg|png)$/i;
  const images = [];

  // start by collecting all attachment images
  getItemsOfType(data, 'attachment').forEach(attachment => {
    const url = attachment.attachment_url[0];
    if (imageExtension.test(url)) {
      images.push({
        id: attachment.post_id[0],
        postId: attachment.post_parent[0],
        url: url
      });
    }
  });

  // optionally add images scraped from <img> tags in post content
  if (argv.addcontentimages) {
    addContentImages(data, images);
  }

  return images;
}
// scans the content of every post for <img> tags and appends any image not
// already listed for that post to the given images array (mutated in place)
function addContentImages(data, images) {
  // [^"]+? keeps the capture inside the src attribute's own quotes, so an image
  // filename in a later attribute (alt, title, etc.) is never mistaken for the URL
  const regex = (/<img[^>]*src="([^"]+?\.(?:gif|jpg|png))"[^>]*>/gi);

  getItemsOfType(data, 'post').forEach(post => {
    const postId = post.post_id[0];
    const postContent = post.encoded[0];
    const postLink = post.link[0];

    // reset lastIndex since we're reusing the same regex object
    regex.lastIndex = 0;

    let match;
    while ((match = regex.exec(postContent)) !== null) {
      // base the matched image URL relative to the post URL
      const url = new URL(match[1], postLink).href;

      // add image if it hasn't already been added for this post
      const exists = images.some(image => image.postId === postId && image.url === url);
      if (!exists) {
        images.push({
          // scraped images have no attachment id
          id: -1,
          postId: postId,
          url: url
        });
        console.log('Scraped ' + url + '.');
      }
    }
  });
}
// builds one object per post in the export data, made up of meta (used
// internally), frontmatter, and the markdown content
function collectPosts(data) {
  // this is passed into getPostContent() for the markdown conversion
  // (declared locally so it does not leak out as an implicit global)
  const turndownService = initTurndownService();

  return getItemsOfType(data, 'post')
    .map(post => ({
      // meta data isn't written to file, but is used to help with other things
      meta: {
        id: getPostId(post),
        slug: getPostSlug(post),
        coverImageId: getPostCoverImageId(post)
      },
      frontmatter: {
        title: getPostTitle(post),
        date: getPostDate(post)
      },
      content: getPostContent(post, turndownService)
    }));
}
// creates the turndown service used for the HTML to markdown conversion and
// registers rules that keep certain embeds as raw HTML
function initTurndownService() {
  const turndownService = new turndown({
    headingStyle: 'atx',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced'
  });

  // preserve embedded tweets
  turndownService.addRule('tweet', {
    filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet',
    replacement: (content, node) => '\n\n' + node.outerHTML
  });

  // preserve embedded codepens
  turndownService.addRule('codepen', {
    filter: node => {
      // codepen embed snippets have changed over the years
      // but this series of checks should find the commonalities
      return (
        ['P', 'DIV'].includes(node.nodeName) &&
        node.attributes['data-slug-hash'] &&
        node.getAttribute('class') === 'codepen'
      );
    },
    replacement: (content, node) => '\n\n' + node.outerHTML
  });

  // preserve embedded scripts (for tweets, codepens, gists, etc.)
  turndownService.addRule('script', {
    filter: 'script',
    replacement: (content, node) => {
      // keep twitter and codepen <script> tags snug with the element above them,
      // otherwise put a blank line before the script
      const isSnug = node.previousSibling && node.previousSibling.nodeName !== '#text';
      const before = isSnug ? '\n' : '\n\n';

      // write the boolean attribute without its empty value
      const html = node.outerHTML.replace('async=""', 'async');
      return before + html + '\n\n';
    }
  });

  // preserve iframes (common for embedded audio/video)
  turndownService.addRule('iframe', {
    filter: 'iframe',
    replacement: (content, node) => {
      const html = node.outerHTML
        .replace('allowfullscreen=""', 'allowfullscreen');
      return '\n\n' + html + '\n\n';
    }
  });

  return turndownService;
}
// returns the items of the first channel whose post_type matches the given type
function getItemsOfType(data, type) {
  const { item: items } = data.rss.channel[0];
  const hasRequestedType = entry => entry.post_type[0] === type;
  return items.filter(hasRequestedType);
}
// returns the first post_id value of the post
function getPostId(post) {
  const ids = post.post_id;
  return ids[0];
}
// returns the value of the post's _thumbnail_id meta entry, or undefined when
// the post has no postmeta or no such entry
function getPostCoverImageId(post) {
  const { postmeta: metaEntries } = post;
  if (metaEntries === undefined) {
    return undefined;
  }

  const thumbnail = metaEntries.find(entry => entry.meta_key[0] === '_thumbnail_id');
  if (!thumbnail) {
    return undefined;
  }
  return thumbnail.meta_value[0];
}
// returns the first post_name value of the post, used as its slug
function getPostSlug(post) {
  const names = post.post_name;
  return names[0];
}
// returns the trimmed post title with every double quote escaped by a backslash
function getPostTitle(post) {
  const trimmedTitle = post.title[0].trim();
  return trimmedTitle.split('"').join('\\"');
}
// parses the post's pubDate with luxon's RFC 2822 parser (zone set to utc)
// and returns the result of toISODate()
function getPostDate(post) {
  const published = luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: 'utc' });
  return published.toISODate();
}
// converts the post's HTML content to markdown using the given turndown service
function getPostContent(post, turndownService) {
  let content = post.encoded[0].trim();

  // insert an empty div element between double line breaks
  // this nifty trick causes turndown to keep adjacent paragraphs separated
  // without mucking up content inside of other elements (like <code> blocks)
  content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');

  if (argv.addcontentimages) {
    // writeImageFile() will save all content images to a relative /images
    // folder so update references in post content to match
    // ([^"]*? keeps the match inside the src attribute's own quotes, so a
    // filename in a later attribute is never mistaken for the image source)
    content = content.replace(/(<img[^>]*src=")[^"]*?([^\/"]+\.(?:gif|jpg|png))("[^>]*>)/gi, '$1images/$2$3');
  }

  // this is a hack to make <iframe> nodes non-empty by inserting a "." which
  // allows the iframe rule declared in initTurndownService() to take effect
  // (using turndown's blankRule() and keep() solution did not work for me)
  content = content.replace(/(<\/iframe>)/gi, '.$1');

  // use turndown to convert HTML to Markdown
  content = turndownService.turndown(content);

  // clean up extra spaces in list items
  content = content.replace(/(-|\d+\.) +/g, '$1 ');

  // clean up the "." from the iframe hack above
  content = content.replace(/\.(<\/iframe>)/gi, '$1');

  return content;
}
// attaches each image URL to the post it belongs to (posts are mutated in place)
// and records the cover image filename in the post's frontmatter
function mergeImagesIntoPosts(images, posts) {
  // create lookup table for quicker traversal
  const postsById = {};
  posts.forEach(post => {
    postsById[post.meta.id] = post;
  });

  images.forEach(image => {
    const owner = postsById[image.postId];
    if (!owner) {
      // image belongs to something that isn't one of the collected posts
      return;
    }

    // save full image URLs for downloading later
    if (!owner.meta.imageUrls) {
      owner.meta.imageUrls = [];
    }
    owner.meta.imageUrls.push(image.url);

    if (image.id === owner.meta.coverImageId) {
      // save cover image filename to frontmatter
      owner.frontmatter.coverImage = getFilenameFromUrl(image.url);
    }
  });
}
function writeFiles(posts) {
let delay = 0;
posts.forEach(post => {
@@ -277,7 +28,7 @@ function writeFiles(posts) {
createDir(postDir);
writeMarkdownFile(post, postDir);
if (argv.saveimages && post.meta.imageUrls) {
if (config.saveimages && post.meta.imageUrls) {
post.meta.imageUrls.forEach(imageUrl => {
const imageDir = path.join(postDir, 'images');
createDir(imageDir);
@@ -307,7 +58,7 @@ function writeMarkdownFile(post, postDir) {
}
function writeImageFile(imageUrl, imageDir, delay) {
let imagePath = path.join(imageDir, getFilenameFromUrl(imageUrl));
let imagePath = path.join(imageDir, shared.getFilenameFromUrl(imageUrl));
let stream = fs.createWriteStream(imagePath);
stream.on('finish', () => {
console.log('Saved ' + imagePath + '.');
@@ -330,10 +81,6 @@ function writeImageFile(imageUrl, imageDir, delay) {
}, delay);
}
// returns everything after the last "/" in the url
function getFilenameFromUrl(url) {
  const segments = url.split('/');
  return segments[segments.length - 1];
}
function createDir(dir) {
try {
fs.accessSync(dir, fs.constants.F_OK);
@@ -343,18 +90,18 @@ function createDir(dir) {
}
function getPostDir(post) {
let dir = argv.output;
let dir = config.output;
let dt = luxon.DateTime.fromISO(post.frontmatter.date);
if (argv.yearmonthfolders) {
if (config.yearmonthfolders) {
dir = path.join(dir, dt.toFormat('yyyy'), dt.toFormat('LL'));
} else if (argv.yearfolders) {
} else if (config.yearfolders) {
dir = path.join(dir, dt.toFormat('yyyy'));
}
if (argv.postfolders) {
if (config.postfolders) {
let folder = post.meta.slug;
if (argv.prefixdate) {
if (config.prefixdate) {
folder = dt.toFormat('yyyy-LL-dd') + '-' + folder;
}
dir = path.join(dir, folder);
@@ -364,12 +111,12 @@ function getPostDir(post) {
}
function getPostFilename(post) {
if (argv.postfolders) {
if (config.postfolders) {
// the containing folder name will be unique, just use index.md here
return 'index.md';
} else {
let filename = post.meta.slug + '.md';
if (argv.prefixdate) {
if (config.prefixdate) {
let dt = luxon.DateTime.fromISO(post.frontmatter.date);
filename = dt.toFormat('yyyy-LL-dd') + '-' + filename;
}
+232
View File
@@ -0,0 +1,232 @@
const fs = require('fs');
const luxon = require('luxon');
const turndown = require('turndown');
const xml2js = require('xml2js');
const shared = require('./shared');
let config;
// reads and parses the export file named by configIn.input and resolves with
// the collected posts; read and parse failures reject the returned promise
async function parseFilePromise(configIn) {
  const content = fs.readFileSync(configIn.input, 'utf8');

  // apply xml2js's stripPrefix processor to every tag name
  const processors = { tagNameProcessors: [xml2js.processors.stripPrefix] };
  const data = await xml2js.parseStringPromise(content, processors);

  // stored at module level so the other functions in this file can read it
  config = configIn;

  // an async function already wraps its return value in a promise
  return processData(data);
}
// turns the parsed export data into posts with their images merged in
function processData(data) {
  const allImages = collectImages(data);
  const allPosts = collectPosts(data);

  // attach image URLs (and cover image filenames) to the posts they belong to
  mergeImagesIntoPosts(allImages, allPosts);

  return allPosts;
}
// builds the list of images ({ id, postId, url }) found in the export data
function collectImages(data) {
  // only attachments with these file extensions count as images
  const imageExtension = /\.(gif|jpg|png)$/i;
  const images = [];

  // start by collecting all attachment images
  getItemsOfType(data, 'attachment').forEach(attachment => {
    const url = attachment.attachment_url[0];
    if (imageExtension.test(url)) {
      images.push({
        id: attachment.post_id[0],
        postId: attachment.post_parent[0],
        url: url
      });
    }
  });

  // optionally add images scraped from <img> tags in post content
  if (config.addcontentimages) {
    addContentImages(data, images);
  }

  return images;
}
// scans the content of every post for <img> tags and appends any image not
// already listed for that post to the given images array (mutated in place)
function addContentImages(data, images) {
  // [^"]+? keeps the capture inside the src attribute's own quotes, so an image
  // filename in a later attribute (alt, title, etc.) is never mistaken for the URL
  const regex = (/<img[^>]*src="([^"]+?\.(?:gif|jpg|png))"[^>]*>/gi);

  getItemsOfType(data, 'post').forEach(post => {
    const postId = post.post_id[0];
    const postContent = post.encoded[0];
    const postLink = post.link[0];

    // reset lastIndex since we're reusing the same regex object
    regex.lastIndex = 0;

    let match;
    while ((match = regex.exec(postContent)) !== null) {
      // base the matched image URL relative to the post URL
      const url = new URL(match[1], postLink).href;

      // add image if it hasn't already been added for this post
      const exists = images.some(image => image.postId === postId && image.url === url);
      if (!exists) {
        images.push({
          // scraped images have no attachment id
          id: -1,
          postId: postId,
          url: url
        });
        console.log('Scraped ' + url + '.');
      }
    }
  });
}
// builds one object per post in the export data, made up of meta (used
// internally), frontmatter, and the markdown content
function collectPosts(data) {
  // this is passed into getPostContent() for the markdown conversion
  // (declared locally so it does not leak out as an implicit global)
  const turndownService = initTurndownService();

  return getItemsOfType(data, 'post')
    .map(post => ({
      // meta data isn't written to file, but is used to help with other things
      meta: {
        id: getPostId(post),
        slug: getPostSlug(post),
        coverImageId: getPostCoverImageId(post)
      },
      frontmatter: {
        title: getPostTitle(post),
        date: getPostDate(post)
      },
      content: getPostContent(post, turndownService)
    }));
}
// creates the turndown service used for the HTML to markdown conversion and
// registers rules that keep certain embeds as raw HTML
function initTurndownService() {
  const turndownService = new turndown({
    headingStyle: 'atx',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced'
  });

  // preserve embedded tweets
  turndownService.addRule('tweet', {
    filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet',
    replacement: (content, node) => '\n\n' + node.outerHTML
  });

  // preserve embedded codepens
  turndownService.addRule('codepen', {
    filter: node => {
      // codepen embed snippets have changed over the years
      // but this series of checks should find the commonalities
      return (
        ['P', 'DIV'].includes(node.nodeName) &&
        node.attributes['data-slug-hash'] &&
        node.getAttribute('class') === 'codepen'
      );
    },
    replacement: (content, node) => '\n\n' + node.outerHTML
  });

  // preserve embedded scripts (for tweets, codepens, gists, etc.)
  turndownService.addRule('script', {
    filter: 'script',
    replacement: (content, node) => {
      // keep twitter and codepen <script> tags snug with the element above them,
      // otherwise put a blank line before the script
      const isSnug = node.previousSibling && node.previousSibling.nodeName !== '#text';
      const before = isSnug ? '\n' : '\n\n';

      // write the boolean attribute without its empty value
      const html = node.outerHTML.replace('async=""', 'async');
      return before + html + '\n\n';
    }
  });

  // preserve iframes (common for embedded audio/video)
  turndownService.addRule('iframe', {
    filter: 'iframe',
    replacement: (content, node) => {
      const html = node.outerHTML
        .replace('allowfullscreen=""', 'allowfullscreen');
      return '\n\n' + html + '\n\n';
    }
  });

  return turndownService;
}
// returns the items of the first channel whose post_type matches the given type
function getItemsOfType(data, type) {
  const { item: items } = data.rss.channel[0];
  const hasRequestedType = entry => entry.post_type[0] === type;
  return items.filter(hasRequestedType);
}
// returns the first post_id value of the post
function getPostId(post) {
  const ids = post.post_id;
  return ids[0];
}
// returns the value of the post's _thumbnail_id meta entry, or undefined when
// the post has no postmeta or no such entry
function getPostCoverImageId(post) {
  const { postmeta: metaEntries } = post;
  if (metaEntries === undefined) {
    return undefined;
  }

  const thumbnail = metaEntries.find(entry => entry.meta_key[0] === '_thumbnail_id');
  if (!thumbnail) {
    return undefined;
  }
  return thumbnail.meta_value[0];
}
// returns the first post_name value of the post, used as its slug
function getPostSlug(post) {
  const names = post.post_name;
  return names[0];
}
// returns the trimmed post title with every double quote escaped by a backslash
function getPostTitle(post) {
  const trimmedTitle = post.title[0].trim();
  return trimmedTitle.split('"').join('\\"');
}
// parses the post's pubDate with luxon's RFC 2822 parser (zone set to utc)
// and returns the result of toISODate()
function getPostDate(post) {
  const published = luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: 'utc' });
  return published.toISODate();
}
// converts the post's HTML content to markdown using the given turndown service
function getPostContent(post, turndownService) {
  let content = post.encoded[0].trim();

  // insert an empty div element between double line breaks
  // this nifty trick causes turndown to keep adjacent paragraphs separated
  // without mucking up content inside of other elements (like <code> blocks)
  content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');

  if (config.addcontentimages) {
    // writeImageFile() will save all content images to a relative /images
    // folder so update references in post content to match
    // ([^"]*? keeps the match inside the src attribute's own quotes, so a
    // filename in a later attribute is never mistaken for the image source)
    content = content.replace(/(<img[^>]*src=")[^"]*?([^\/"]+\.(?:gif|jpg|png))("[^>]*>)/gi, '$1images/$2$3');
  }

  // this is a hack to make <iframe> nodes non-empty by inserting a "." which
  // allows the iframe rule declared in initTurndownService() to take effect
  // (using turndown's blankRule() and keep() solution did not work for me)
  content = content.replace(/(<\/iframe>)/gi, '.$1');

  // use turndown to convert HTML to Markdown
  content = turndownService.turndown(content);

  // clean up extra spaces in list items
  content = content.replace(/(-|\d+\.) +/g, '$1 ');

  // clean up the "." from the iframe hack above
  content = content.replace(/\.(<\/iframe>)/gi, '$1');

  return content;
}
// attaches each image URL to the post it belongs to (posts are mutated in place)
// and records the cover image filename in the post's frontmatter
function mergeImagesIntoPosts(images, posts) {
  // create lookup table for quicker traversal
  const postsById = {};
  posts.forEach(post => {
    postsById[post.meta.id] = post;
  });

  images.forEach(image => {
    const owner = postsById[image.postId];
    if (!owner) {
      // image belongs to something that isn't one of the collected posts
      return;
    }

    // save full image URLs for downloading later
    if (!owner.meta.imageUrls) {
      owner.meta.imageUrls = [];
    }
    owner.meta.imageUrls.push(image.url);

    if (image.id === owner.meta.coverImageId) {
      // save cover image filename to frontmatter
      owner.frontmatter.coverImage = shared.getFilenameFromUrl(image.url);
    }
  });
}
exports.parseFilePromise = parseFilePromise;
+5
View File
@@ -0,0 +1,5 @@
// returns everything after the last "/" in the url
function getFilenameFromUrl(url) {
  const segments = url.split('/');
  return segments[segments.length - 1];
}
exports.getFilenameFromUrl = getFilenameFromUrl;
+48
View File
@@ -0,0 +1,48 @@
const fs = require('fs');
const minimist = require('minimist');
// builds the config object from the command line arguments
// throws an Error when the configured input file does not exist
function getConfig() {
  const options = {
    string: ['input', 'output'],
    boolean: [
      'yearmonthfolders',
      'yearfolders',
      'postfolders',
      'prefixdate',
      'saveimages',
      'addcontentimages'
    ],
    default: {
      input: 'export.xml',
      output: 'output',
      yearmonthfolders: false,
      yearfolders: false,
      postfolders: true,
      prefixdate: false,
      saveimages: true,
      addcontentimages: false
    }
  };

  // the first two entries of process.argv are dropped before parsing
  const config = minimist(process.argv.slice(2), options);

  // TODO: when wizard is implemented user will be asked to repeat input instead of bombing
  const inputExists = checkFileExists(config.input);
  if (!inputExists) {
    throw new Error('Input file does not exist.');
  }

  // drop the "_" property from the parsed result so it is not part of the config
  delete config._;

  return config;
}
// returns true only when path points at an existing regular file
// (a directory does not count, since it could not be read as the input file)
function checkFileExists(path) {
  try {
    // statSync throws when the path does not exist or cannot be accessed
    return fs.statSync(path).isFile();
  } catch (ex) {
    return false;
  }
}
exports.getConfig = getConfig;
View File