mirror of
https://github.com/10h30/wordpress-export-to-markdown.git
synced 2026-06-05 15:09:59 +09:00
Split out code for writer and translator
This commit is contained in:
@@ -1,11 +1,6 @@
|
||||
const fs = require('fs');
|
||||
const luxon = require('luxon');
|
||||
const path = require('path');
|
||||
const request = require('request');
|
||||
|
||||
const shared = require('./src/shared');
|
||||
const wizard = require('./src/wizard');
|
||||
const parser = require('./src/parser');
|
||||
const writer = require('./src/writer');
|
||||
|
||||
// global so various functions can access arguments
|
||||
let config;
|
||||
@@ -14,115 +9,12 @@ async function init() {
|
||||
try {
|
||||
config = wizard.getConfig();
|
||||
let posts = await parser.parseFilePromise(config)
|
||||
writeFiles(posts);
|
||||
writer.writeFiles(posts, config);
|
||||
} catch (ex) {
|
||||
// appease the UnhandledPromiseRejectionWarning
|
||||
console.error(ex);
|
||||
}
|
||||
}
|
||||
|
||||
function writeFiles(posts) {
|
||||
let delay = 0;
|
||||
posts.forEach(post => {
|
||||
const postDir = getPostDir(post);
|
||||
createDir(postDir);
|
||||
writeMarkdownFile(post, postDir);
|
||||
|
||||
if (config.saveimages && post.meta.imageUrls) {
|
||||
post.meta.imageUrls.forEach(imageUrl => {
|
||||
const imageDir = path.join(postDir, 'images');
|
||||
createDir(imageDir);
|
||||
writeImageFile(imageUrl, imageDir, delay);
|
||||
delay += 25;
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function writeMarkdownFile(post, postDir) {
|
||||
const frontmatter = Object.entries(post.frontmatter)
|
||||
.reduce((accumulator, pair) => {
|
||||
return accumulator + pair[0] + ': "' + pair[1] + '"\n'
|
||||
}, '');
|
||||
const data = '---\n' + frontmatter + '---\n\n' + post.content + '\n';
|
||||
|
||||
const postPath = path.join(postDir, getPostFilename(post));
|
||||
fs.writeFile(postPath, data, (err) => {
|
||||
if (err) {
|
||||
console.log('Unable to write file.')
|
||||
console.log(err);
|
||||
} else {
|
||||
console.log('Wrote ' + postPath + '.');
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function writeImageFile(imageUrl, imageDir, delay) {
|
||||
let imagePath = path.join(imageDir, shared.getFilenameFromUrl(imageUrl));
|
||||
let stream = fs.createWriteStream(imagePath);
|
||||
stream.on('finish', () => {
|
||||
console.log('Saved ' + imagePath + '.');
|
||||
});
|
||||
|
||||
// stagger image requests so we don't piss off hosts
|
||||
setTimeout(() => {
|
||||
request
|
||||
.get(imageUrl)
|
||||
.on('response', response => {
|
||||
if (response.statusCode !== 200) {
|
||||
console.log('Response status code ' + response.statusCode + ' received for ' + imageUrl + '.');
|
||||
}
|
||||
})
|
||||
.on('error', err => {
|
||||
console.log('Unable to download image.');
|
||||
console.log(err);
|
||||
})
|
||||
.pipe(stream);
|
||||
}, delay);
|
||||
}
|
||||
|
||||
function createDir(dir) {
|
||||
try {
|
||||
fs.accessSync(dir, fs.constants.F_OK);
|
||||
} catch (ex) {
|
||||
fs.mkdirSync(dir, { recursive: true });
|
||||
}
|
||||
}
|
||||
|
||||
function getPostDir(post) {
|
||||
let dir = config.output;
|
||||
let dt = luxon.DateTime.fromISO(post.frontmatter.date);
|
||||
|
||||
if (config.yearmonthfolders) {
|
||||
dir = path.join(dir, dt.toFormat('yyyy'), dt.toFormat('LL'));
|
||||
} else if (config.yearfolders) {
|
||||
dir = path.join(dir, dt.toFormat('yyyy'));
|
||||
}
|
||||
|
||||
if (config.postfolders) {
|
||||
let folder = post.meta.slug;
|
||||
if (config.prefixdate) {
|
||||
folder = dt.toFormat('yyyy-LL-dd') + '-' + folder;
|
||||
}
|
||||
dir = path.join(dir, folder);
|
||||
}
|
||||
|
||||
return dir;
|
||||
}
|
||||
|
||||
function getPostFilename(post) {
|
||||
if (config.postfolders) {
|
||||
// the containing folder name will be unique, just use index.md here
|
||||
return 'index.md';
|
||||
} else {
|
||||
let filename = post.meta.slug + '.md';
|
||||
if (config.prefixdate) {
|
||||
let dt = luxon.DateTime.fromISO(post.frontmatter.date);
|
||||
filename = dt.toFormat('yyyy-LL-dd') + '-' + filename;
|
||||
}
|
||||
return filename;
|
||||
}
|
||||
}
|
||||
|
||||
// it's go time!
|
||||
init();
|
||||
|
||||
+4
-90
@@ -1,9 +1,9 @@
|
||||
const fs = require('fs');
|
||||
const luxon = require('luxon');
|
||||
const turndown = require('turndown');
|
||||
const xml2js = require('xml2js');
|
||||
|
||||
const shared = require('./shared');
|
||||
const translator = require('./translator');
|
||||
|
||||
let config;
|
||||
|
||||
@@ -76,7 +76,7 @@ function addContentImages(data, images) {
|
||||
|
||||
function collectPosts(data) {
|
||||
// this is passed into getPostContent() for the markdown conversion
|
||||
turndownService = initTurndownService();
|
||||
turndownService = translator.initTurndownService();
|
||||
|
||||
return getItemsOfType(data, 'post')
|
||||
.map(post => ({
|
||||
@@ -90,65 +90,10 @@ function collectPosts(data) {
|
||||
title: getPostTitle(post),
|
||||
date: getPostDate(post)
|
||||
},
|
||||
content: getPostContent(post, turndownService)
|
||||
content: translator.getPostContent(post, turndownService, config)
|
||||
}));
|
||||
}
|
||||
|
||||
function initTurndownService() {
|
||||
let turndownService = new turndown({
|
||||
headingStyle: 'atx',
|
||||
bulletListMarker: '-',
|
||||
codeBlockStyle: 'fenced'
|
||||
});
|
||||
|
||||
// preserve embedded tweets
|
||||
turndownService.addRule('tweet', {
|
||||
filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet',
|
||||
replacement: (content, node) => '\n\n' + node.outerHTML
|
||||
});
|
||||
|
||||
// preserve embedded codepens
|
||||
turndownService.addRule('codepen', {
|
||||
filter: node => {
|
||||
// codepen embed snippets have changed over the years
|
||||
// but this series of checks should find the commonalities
|
||||
return (
|
||||
['P', 'DIV'].includes(node.nodeName) &&
|
||||
node.attributes['data-slug-hash'] &&
|
||||
node.getAttribute('class') === 'codepen'
|
||||
);
|
||||
},
|
||||
replacement: (content, node) => '\n\n' + node.outerHTML
|
||||
});
|
||||
|
||||
// preserve embedded scripts (for tweets, codepens, gists, etc.)
|
||||
turndownService.addRule('script', {
|
||||
filter: 'script',
|
||||
replacement: (content, node) => {
|
||||
let before = '\n\n';
|
||||
let src = node.getAttribute('src');
|
||||
if (node.previousSibling && node.previousSibling.nodeName !== '#text') {
|
||||
// keep twitter and codepen <script> tags snug with the element above them
|
||||
before = '\n';
|
||||
}
|
||||
let html = node.outerHTML.replace('async=""', 'async');
|
||||
return before + html + '\n\n';
|
||||
}
|
||||
});
|
||||
|
||||
// preserve iframes (common for embedded audio/video)
|
||||
turndownService.addRule('iframe', {
|
||||
filter: 'iframe',
|
||||
replacement: (content, node) => {
|
||||
let html = node.outerHTML
|
||||
.replace('allowfullscreen=""', 'allowfullscreen');
|
||||
return '\n\n' + html + '\n\n';
|
||||
}
|
||||
});
|
||||
|
||||
return turndownService;
|
||||
}
|
||||
|
||||
function getItemsOfType(data, type) {
|
||||
return data.rss.channel[0].item.filter(item => item.post_type[0] === type);
|
||||
}
|
||||
@@ -176,37 +121,6 @@ function getPostDate(post) {
|
||||
return luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: 'utc' }).toISODate();
|
||||
}
|
||||
|
||||
function getPostContent(post, turndownService) {
|
||||
let content = post.encoded[0].trim();
|
||||
|
||||
// insert an empty div element between double line breaks
|
||||
// this nifty trick causes turndown to keep adjacent paragraphs separated
|
||||
// without mucking up content inside of other elemnts (like <code> blocks)
|
||||
content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');
|
||||
|
||||
if (config.addcontentimages) {
|
||||
// writeImageFile() will save all content images to a relative /images
|
||||
// folder so update references in post content to match
|
||||
content = content.replace(/(<img[^>]*src=").*?([^\/"]+\.(?:gif|jpg|png))("[^>]*>)/gi, '$1images/$2$3');
|
||||
}
|
||||
|
||||
// this is a hack to make <iframe> nodes non-empty by inserting a "." which
|
||||
// allows the iframe rule declared in initTurndownService() to take effect
|
||||
// (using turndown's blankRule() and keep() solution did not work for me)
|
||||
content = content.replace(/(<\/iframe>)/gi, '.$1');
|
||||
|
||||
// use turndown to convert HTML to Markdown
|
||||
content = turndownService.turndown(content);
|
||||
|
||||
// clean up extra spaces in list items
|
||||
content = content.replace(/(-|\d+\.) +/g, '$1 ');
|
||||
|
||||
// clean up the "." from the iframe hack above
|
||||
content = content.replace(/\.(<\/iframe>)/gi, '$1');
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
function mergeImagesIntoPosts(images, posts) {
|
||||
// create lookup table for quicker traversal
|
||||
let postsLookup = posts.reduce((lookup, post) => {
|
||||
@@ -229,4 +143,4 @@ function mergeImagesIntoPosts(images, posts) {
|
||||
});
|
||||
}
|
||||
|
||||
exports.parseFilePromise = parseFilePromise;
|
||||
exports.parseFilePromise = parseFilePromise;
|
||||
|
||||
+1
-1
@@ -2,4 +2,4 @@ function getFilenameFromUrl(url) {
|
||||
return url.split('/').slice(-1)[0];
|
||||
}
|
||||
|
||||
exports.getFilenameFromUrl = getFilenameFromUrl;
|
||||
exports.getFilenameFromUrl = getFilenameFromUrl;
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
const turndown = require('turndown');
|
||||
|
||||
/**
 * Build the turndown (HTML -> Markdown) converter used for post content.
 *
 * On top of the base options (ATX headings, "-" bullets, fenced code blocks)
 * this registers rules that pass certain embeds through as raw HTML instead of
 * converting them: tweets, codepens, <script> tags and <iframe>s.
 *
 * @returns {object} the configured turndown service instance
 */
function initTurndownService() {
	const turndownService = new turndown({
		headingStyle: 'atx',
		bulletListMarker: '-',
		codeBlockStyle: 'fenced'
	});

	// true when the node's class attribute contains the given class name
	// (token match, so embeds carrying additional classes are still recognized)
	const hasClass = (node, className) =>
		(node.getAttribute('class') || '').split(/\s+/).includes(className);

	// the raw HTML of the node, separated from preceding content by a blank line
	const keepOuterHtml = (content, node) => '\n\n' + node.outerHTML;

	// preserve embedded tweets
	turndownService.addRule('tweet', {
		filter: node => node.nodeName === 'BLOCKQUOTE' && hasClass(node, 'twitter-tweet'),
		replacement: keepOuterHtml
	});

	// preserve embedded codepens
	turndownService.addRule('codepen', {
		filter: node => {
			// codepen embed snippets have changed over the years
			// but this series of checks should find the commonalities
			return (
				['P', 'DIV'].includes(node.nodeName) &&
				node.hasAttribute('data-slug-hash') &&
				hasClass(node, 'codepen')
			);
		},
		replacement: keepOuterHtml
	});

	// preserve embedded scripts (for tweets, codepens, gists, etc.)
	turndownService.addRule('script', {
		filter: 'script',
		replacement: (content, node) => {
			let before = '\n\n';
			if (node.previousSibling && node.previousSibling.nodeName !== '#text') {
				// keep twitter and codepen <script> tags snug with the element above them
				before = '\n';
			}

			// serialize the boolean attribute in its short form
			const html = node.outerHTML.replace('async=""', 'async');
			return before + html + '\n\n';
		}
	});

	// preserve iframes (common for embedded audio/video)
	turndownService.addRule('iframe', {
		filter: 'iframe',
		replacement: (content, node) => {
			// serialize the boolean attribute in its short form
			const html = node.outerHTML.replace('allowfullscreen=""', 'allowfullscreen');
			return '\n\n' + html + '\n\n';
		}
	});

	return turndownService;
}
|
||||
|
||||
/**
 * Convert a post's HTML body to Markdown.
 *
 * @param {object} post - parsed post item; the HTML is read from post.encoded[0]
 * @param {object} turndownService - converter exposing turndown(html)
 * @param {object} config - options; only config.addcontentimages is read here
 * @returns {string} the converted post content
 */
function getPostContent(post, turndownService, config) {
	let content = post.encoded[0].trim();

	// insert an empty div element between double line breaks
	// this nifty trick causes turndown to keep adjacent paragraphs separated
	// without mucking up content inside of other elements (like <code> blocks)
	content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');

	if (config.addcontentimages) {
		// writeImageFile() will save all content images to a relative /images
		// folder so update references in post content to match
		content = content.replace(/(<img[^>]*src=").*?([^\/"]+\.(?:gif|jpg|png))("[^>]*>)/gi, '$1images/$2$3');
	}

	// this is a hack to make <iframe> nodes non-empty by inserting a "." which
	// allows the iframe rule declared in initTurndownService() to take effect
	// (using turndown's blankRule() and keep() solution did not work for me)
	content = content.replace(/(<\/iframe>)/gi, '.$1');

	// use turndown to convert HTML to Markdown
	content = turndownService.turndown(content);

	// clean up extra spaces in list items
	// anchored to the start of a line (after optional indentation or blockquote
	// markers) so hyphens and "N." sequences in the middle of a line are left alone
	content = content.replace(/^([ \t>]*)(-|\d+\.) +/gm, '$1$2 ');

	// clean up the "." from the iframe hack above
	content = content.replace(/\.(<\/iframe>)/gi, '$1');

	return content;
}
|
||||
|
||||
exports.initTurndownService = initTurndownService;
|
||||
exports.getPostContent = getPostContent;
|
||||
+1
-1
@@ -45,4 +45,4 @@ function checkFileExists(path) {
|
||||
}
|
||||
}
|
||||
|
||||
exports.getConfig = getConfig;
|
||||
exports.getConfig = getConfig;
|
||||
|
||||
+115
@@ -0,0 +1,115 @@
|
||||
const fs = require('fs');
|
||||
const luxon = require('luxon');
|
||||
const path = require('path');
|
||||
const request = require('request');
|
||||
|
||||
const shared = require('./shared');
|
||||
|
||||
let config;
|
||||
|
||||
/**
 * Write every post to disk: its markdown file and, when enabled, its images.
 *
 * Stores configIn in the module-level config so the helper functions in this
 * module can read it.
 *
 * @param {Array<object>} posts - posts to write; reads post.meta.imageUrls here
 * @param {object} configIn - options; config.saveimages is read here
 */
function writeFiles(posts, configIn) {
	config = configIn;

	// gap added between successive image downloads, in milliseconds
	const staggerMs = 25;

	// running delay shared across all posts so image requests stay staggered
	let delay = 0;

	posts.forEach(post => {
		const postDir = getPostDir(post);
		createDir(postDir);
		writeMarkdownFile(post, postDir);

		const imageUrls = post.meta.imageUrls;
		if (!config.saveimages || !imageUrls || imageUrls.length === 0) {
			return;
		}

		// the images folder is the same for every image of a post, so create it once
		const imageDir = path.join(postDir, 'images');
		createDir(imageDir);

		imageUrls.forEach(imageUrl => {
			writeImageFile(imageUrl, imageDir, delay);
			delay += staggerMs;
		});
	});
}
|
||||
|
||||
/**
 * Serialize frontmatter key/value pairs as `key: "value"` lines.
 *
 * Backslashes and double quotes inside a value are escaped so the value
 * remains a valid double-quoted scalar.
 *
 * @param {object} frontmatter - key/value pairs to serialize
 * @returns {string} one line per pair, each terminated by a newline
 */
function buildFrontmatter(frontmatter) {
	return Object.entries(frontmatter)
		.map(([key, value]) => {
			const escaped = String(value)
				.replace(/\\/g, '\\\\')
				.replace(/"/g, '\\"');
			return key + ': "' + escaped + '"\n';
		})
		.join('');
}

/**
 * Write a post's markdown file (frontmatter block followed by content).
 *
 * The write is asynchronous; success or failure is reported on the console.
 *
 * @param {object} post - reads post.frontmatter and post.content
 * @param {string} postDir - directory the file is written into
 */
function writeMarkdownFile(post, postDir) {
	const data = '---\n' + buildFrontmatter(post.frontmatter) + '---\n\n' + post.content + '\n';

	const postPath = path.join(postDir, getPostFilename(post));
	fs.writeFile(postPath, data, (err) => {
		if (err) {
			console.log('Unable to write file.');
			console.log(err);
		} else {
			console.log('Wrote ' + postPath + '.');
		}
	});
}
|
||||
|
||||
/**
 * Download one image into imageDir after waiting `delay` milliseconds.
 *
 * The file name is derived from the URL via shared.getFilenameFromUrl().
 * Progress and failures are reported on the console.
 *
 * @param {string} imageUrl - URL of the image to download
 * @param {string} imageDir - directory the image is saved into
 * @param {number} delay - milliseconds to wait before starting the request
 */
function writeImageFile(imageUrl, imageDir, delay) {
	const imagePath = path.join(imageDir, shared.getFilenameFromUrl(imageUrl));

	// stagger image requests so we don't piss off hosts
	setTimeout(() => {
		// open the file only once the request is about to start, so a large
		// export does not hold a file handle per pending image
		const stream = fs.createWriteStream(imagePath);
		stream.on('finish', () => {
			console.log('Saved ' + imagePath + '.');
		});
		stream.on('error', err => {
			// without a listener a failed write would surface as an unhandled 'error' event
			console.log('Unable to write image file.');
			console.log(err);
		});

		request
			.get(imageUrl)
			.on('response', response => {
				if (response.statusCode !== 200) {
					console.log('Response status code ' + response.statusCode + ' received for ' + imageUrl + '.');
				}
			})
			.on('error', err => {
				console.log('Unable to download image.');
				console.log(err);
				// nothing more will be piped; release the file handle
				stream.destroy();
			})
			.pipe(stream);
	}, delay);
}
|
||||
|
||||
/**
 * Ensure that a directory exists, creating missing parent directories too.
 *
 * With `recursive: true`, mkdirSync does not fail when the directory is
 * already there, so no separate existence check is needed.
 *
 * @param {string} dir - path of the directory to create
 */
function createDir(dir) {
	fs.mkdirSync(dir, { recursive: true });
}
|
||||
|
||||
/**
 * Work out the directory a post is written into.
 *
 * Starts at config.output and appends, depending on config: year/month or
 * year folders taken from the post date, then a per-post folder named after
 * the slug (optionally prefixed with the date).
 *
 * @param {object} post - reads post.frontmatter.date and post.meta.slug
 * @returns {string} the directory path for the post
 */
function getPostDir(post) {
	const postDate = luxon.DateTime.fromISO(post.frontmatter.date);

	// path segments to append below the output directory
	const segments = [];

	if (config.yearmonthfolders) {
		segments.push(postDate.toFormat('yyyy'), postDate.toFormat('LL'));
	} else if (config.yearfolders) {
		segments.push(postDate.toFormat('yyyy'));
	}

	if (config.postfolders) {
		const slug = post.meta.slug;
		segments.push(config.prefixdate ? postDate.toFormat('yyyy-LL-dd') + '-' + slug : slug);
	}

	// with nothing to append, the output directory is returned exactly as configured
	return segments.length > 0 ? path.join(config.output, ...segments) : config.output;
}
|
||||
|
||||
/**
 * Work out the file name of a post's markdown file.
 *
 * @param {object} post - reads post.meta.slug and post.frontmatter.date
 * @returns {string} 'index.md' when posts get their own folder, otherwise
 *   '<slug>.md', prefixed with the post date when config.prefixdate is set
 */
function getPostFilename(post) {
	// the containing folder name will be unique, just use index.md here
	if (config.postfolders) {
		return 'index.md';
	}

	const baseName = post.meta.slug + '.md';
	if (!config.prefixdate) {
		return baseName;
	}

	const postDate = luxon.DateTime.fromISO(post.frontmatter.date);
	return postDate.toFormat('yyyy-LL-dd') + '-' + baseName;
}
|
||||
|
||||
exports.writeFiles = writeFiles;
|
||||
|
||||
Reference in New Issue
Block a user