Split out code for writer and translator

This commit is contained in:
Will Boyd
2019-12-17 13:52:09 -05:00
parent f9e2bc5b0d
commit f4ae769f13
6 changed files with 213 additions and 202 deletions
+4 -90
View File
@@ -1,9 +1,9 @@
const fs = require('fs');
const luxon = require('luxon');
const turndown = require('turndown');
const xml2js = require('xml2js');
const shared = require('./shared');
const translator = require('./translator');
let config;
@@ -76,7 +76,7 @@ function addContentImages(data, images) {
function collectPosts(data) {
// this is passed into getPostContent() for the markdown conversion
turndownService = initTurndownService();
turndownService = translator.initTurndownService();
return getItemsOfType(data, 'post')
.map(post => ({
@@ -90,65 +90,10 @@ function collectPosts(data) {
title: getPostTitle(post),
date: getPostDate(post)
},
content: getPostContent(post, turndownService)
content: translator.getPostContent(post, turndownService, config)
}));
}
/**
 * Creates the turndown instance used to convert post HTML to Markdown and
 * registers extra rules so that certain embeds are emitted as raw HTML
 * instead of being converted.
 *
 * @returns {object} the configured turndown service
 */
function initTurndownService() {
  const turndownService = new turndown({
    headingStyle: 'atx',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced'
  });

  // preserve embedded tweets
  turndownService.addRule('tweet', {
    filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet',
    replacement: (content, node) => '\n\n' + node.outerHTML
  });

  // preserve embedded codepens
  turndownService.addRule('codepen', {
    filter: node => {
      // codepen embed snippets have changed over the years
      // but this series of checks should find the commonalities
      return (
        ['P', 'DIV'].includes(node.nodeName) &&
        node.attributes['data-slug-hash'] &&
        node.getAttribute('class') === 'codepen'
      );
    },
    replacement: (content, node) => '\n\n' + node.outerHTML
  });

  // preserve embedded scripts (for tweets, codepens, gists, etc.)
  turndownService.addRule('script', {
    filter: 'script',
    replacement: (content, node) => {
      // keep twitter and codepen <script> tags snug with the element above them:
      // a single newline when the previous sibling is not a text node
      const followsElement = node.previousSibling && node.previousSibling.nodeName !== '#text';
      const before = followsElement ? '\n' : '\n\n';

      // rewrite async="" as the bare boolean attribute
      const html = node.outerHTML.replace('async=""', 'async');
      return before + html + '\n\n';
    }
  });

  // preserve iframes (common for embedded audio/video)
  turndownService.addRule('iframe', {
    filter: 'iframe',
    replacement: (content, node) => {
      // rewrite allowfullscreen="" as the bare boolean attribute
      const html = node.outerHTML.replace('allowfullscreen=""', 'allowfullscreen');
      return '\n\n' + html + '\n\n';
    }
  });

  return turndownService;
}
/**
 * Returns the items of the first channel whose first post_type entry equals
 * the requested type.
 *
 * @param {object} data - parsed export, read as data.rss.channel[0].item
 * @param {string} type - value compared against item.post_type[0]
 * @returns {Array} matching items, or an empty array if the channel has no items
 */
function getItemsOfType(data, type) {
  // the item property is absent when the channel contains no items at all,
  // so fall back to an empty list instead of throwing
  const items = data.rss.channel[0].item || [];
  return items.filter(item => item.post_type[0] === type);
}
@@ -176,37 +121,6 @@ function getPostDate(post) {
return luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: 'utc' }).toISODate();
}
/**
 * Converts a post's HTML body to Markdown.
 *
 * @param {object} post - item whose HTML body is read from post.encoded[0]
 * @param {object} turndownService - object exposing turndown(html)
 * @returns {string} the converted Markdown
 */
function getPostContent(post, turndownService) {
  const rawHtml = post.encoded[0].trim();

  // put an empty div between double line breaks so that turndown keeps
  // adjacent paragraphs separated, without disturbing content inside of
  // other elements (like <code> blocks)
  let html = rawHtml.replace(/(\r?\n){2}/g, '\n<div></div>\n');

  // content images get saved to a relative /images folder by writeImageFile(),
  // so point the references in the post content there
  if (config.addcontentimages) {
    html = html.replace(/(<img[^>]*src=").*?([^\/"]+\.(?:gif|jpg|png))("[^>]*>)/gi, '$1images/$2$3');
  }

  // hack: make <iframe> nodes non-empty by inserting a "." so that the iframe
  // rule declared in initTurndownService() takes effect
  html = html.replace(/(<\/iframe>)/gi, '.$1');

  // HTML -> Markdown
  const markdown = turndownService.turndown(html);

  return markdown
    // collapse the extra spaces in list items
    .replace(/(-|\d+\.) +/g, '$1 ')
    // remove the "." inserted by the iframe hack above
    .replace(/\.(<\/iframe>)/gi, '$1');
}
function mergeImagesIntoPosts(images, posts) {
// create lookup table for quicker traversal
let postsLookup = posts.reduce((lookup, post) => {
@@ -229,4 +143,4 @@ function mergeImagesIntoPosts(images, posts) {
});
}
exports.parseFilePromise = parseFilePromise;
exports.parseFilePromise = parseFilePromise;