merge master and move code as needed

This commit is contained in:
Will Boyd
2024-02-27 08:13:37 -05:00
21 changed files with 686 additions and 1864 deletions
+1
View File
@@ -0,0 +1 @@
* text=auto
+1
View File
@@ -0,0 +1 @@
ko_fi: lonekorean
+23
View File
@@ -0,0 +1,23 @@
# How to Contribute
Contributions are welcome! Thank you!
## General Guidelines
Some quick notes when making a pull request.
- Match the style and formatting of the code you are editing.
- Each pull request should be focused on a single thing (a single bug fix, a single feature, etc.). This makes reviewing easier and minimizes merge conflicts.
- Include a description of the problem being solved and what your code does. Steps to reproduce the problem or example input/output are very helpful.
## Adding Options
Keeping the wizard as short as possible is a priority. Pull requests that add options to the wizard will probably not be accepted. Instead, you can add an advanced setting to [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js).
## Adding Frontmatter Fields
Similarly, default frontmatter output is limited to just a few widely used fields to avoid bloat. However, you may add new optional frontmatter fields.
To do so, follow the instructions in [/src/frontmatter/example.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/frontmatter/example.js).
Users will be able to include your new frontmatter field by editing `frontmatter_fields` in [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js).
+1 -1
View File
@@ -165,6 +165,6 @@ Some WordPress sites make use of a `"page"` post type and/or custom post types.
## Advanced Settings
You can edit [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js) to tweak advanced settings. This includes things like throttling image downloads or customizing the date format in frontmatter.
You can edit [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js) to tweak advanced settings. This includes things like customizing frontmatter fields and throttling image downloads.
You'll need to run the script locally (not using `npx`) to make use of advanced settings.
+418 -1758
View File
File diff suppressed because it is too large Load Diff
+5 -4
View File
@@ -1,6 +1,6 @@
{
"name": "wordpress-export-to-markdown",
"version": "2.2.2",
"version": "2.3.1",
"description": "Converts a WordPress export XML file into Markdown files.",
"main": "index.js",
"repository": "https://github.com/lonekorean/wordpress-export-to-markdown.git",
@@ -25,12 +25,13 @@
"commander": "^5.0.0",
"compare-versions": "^3.6.0",
"inquirer": "^7.1.0",
"luxon": "^1.23.0",
"luxon": "^3.4.4",
"request": "^2.88.2",
"request-promise-native": "^1.0.8",
"turndown": "^7.0.0",
"require-directory": "^2.1.1",
"turndown": "^7.1.2",
"turndown-plugin-gfm": "^1.0.2",
"xml2js": "^0.4.23"
"xml2js": "^0.6.2"
},
"devDependencies": {
"eslint": "^6.8.0"
+5
View File
@@ -0,0 +1,5 @@
// get author, without decoding
// WordPress doesn't allow funky characters in usernames anyway
module.exports = (post) => {
return post.data.creator[0];
}
+14
View File
@@ -0,0 +1,14 @@
const settings = require('../settings');
// get array of decoded category names, filtered as specified in settings
module.exports = (post) => {
if (!post.data.category) {
return [];
}
const categories = post.data.category
.filter(category => category.$.domain === 'category')
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
return categories.filter(category => !settings.filter_categories.includes(category));
};
+5
View File
@@ -0,0 +1,5 @@
// get cover image filename, previously decoded and set on post.meta
// this one is unique as it relies on special logic executed by the parser
module.exports = (post) => {
return post.meta.coverImage;
};
+16
View File
@@ -0,0 +1,16 @@
const luxon = require('luxon');
const settings = require('../settings');
// get post date, optionally formatted as specified in settings
module.exports = (post) => {
const dateTime = luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: settings.custom_date_timezone || 'utc' });
if (settings.custom_date_formatting) {
return dateTime.toFormat(settings.custom_date_formatting);
} else if (settings.include_time_with_date) {
return dateTime.toISO();
} else {
return dateTime.toISODate();
}
};
+19
View File
@@ -0,0 +1,19 @@
/*
1. Copy this file, rename to the frontmatter field name you want, camelcased
2. Edit frontmatter_fields in settings.js to include your new field name
3. Run the script to see post data dumps, to see what you can work with
4. Write your code to get and return what you want
5. Update "get whatever" comment to describe what you're getting
6. Remove your field name from frontmatter_fields in settings.js
7. Remove this comment block and the debug console code
8. Make that pull request!
*/
// get whatever
module.exports = (post) => {
console.log('\nBEGIN POST DATA DUMP ===========================================================\n');
console.dir(post, { depth: null });
console.log('\nEND POST DATA DUMP =============================================================\n');
return 'EXAMPLE: ' + post.data.title[0];
};
+4
View File
@@ -0,0 +1,4 @@
// get excerpt, not decoded, newlines collapsed
module.exports = (post) => {
return post.data.encoded[1].replace(/[\r\n]+/gm, ' ');
};
+4
View File
@@ -0,0 +1,4 @@
// get ID
module.exports = (post) => {
return post.data.post_id[0];
}
+4
View File
@@ -0,0 +1,4 @@
// get slug, previously decoded and set on post.meta
module.exports = (post) => {
return post.meta.slug;
};
+12
View File
@@ -0,0 +1,12 @@
// get array of decoded tag names
module.exports = (post) => {
if (!post.data.category) {
return [];
}
const categories = post.data.category
.filter(category => category.$.domain === 'post_tag')
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
return categories;
};
+4
View File
@@ -0,0 +1,4 @@
// get simple post title, but not decoded like other frontmatter string fields
module.exports = (post) => {
return post.data.title[0];
};
+5
View File
@@ -0,0 +1,5 @@
// get type, often this will always be "post"
// but can also be "page" or other custom types
module.exports = (post) => {
return post.data.post_type[0];
}
+62 -75
View File
@@ -1,40 +1,45 @@
const fs = require('fs');
const luxon = require('luxon');
const requireDirectory = require('require-directory');
const xml2js = require('xml2js');
const shared = require('./shared');
const settings = require('./settings');
const translator = require('./translator');
// dynamically requires all frontmatter getters
const frontmatterGetters = requireDirectory(module, './frontmatter', { recurse: false });
async function parseFilePromise(config) {
console.log('\nParsing...');
const content = await fs.promises.readFile(config.input, 'utf8');
const data = await xml2js.parseStringPromise(content, {
const allData = await xml2js.parseStringPromise(content, {
trim: true,
tagNameProcessors: [xml2js.processors.stripPrefix]
});
const channelData = allData.rss.channel[0].item;
const postTypes = getPostTypes(data, config);
const posts = collectPosts(data, postTypes, config);
const postTypes = getPostTypes(channelData, config);
const posts = collectPosts(channelData, postTypes, config);
const images = [];
if (config.saveAttachedImages) {
images.push(...collectAttachedImages(data));
images.push(...collectAttachedImages(channelData));
}
if (config.saveScrapedImages) {
images.push(...collectScrapedImages(data, postTypes));
images.push(...collectScrapedImages(channelData, postTypes));
}
mergeImagesIntoPosts(images, posts);
populateFrontmatter(posts);
return posts;
}
function getPostTypes(data, config) {
function getPostTypes(channelData, config) {
if (config.includeOtherTypes) {
// search export file for all post types minus some default types we don't want
// effectively this will be 'post', 'page', and custom post types
const types = data.rss.channel[0].item
const types = channelData
.map(item => item.post_type[0])
.filter(type => !['attachment', 'revision', 'nav_menu_item', 'custom_css', 'customize_changeset'].includes(type));
return [...new Set(types)]; // remove duplicates
@@ -44,34 +49,34 @@ function getPostTypes(data, config) {
}
}
function getItemsOfType(data, type) {
return data.rss.channel[0].item.filter(item => item.post_type[0] === type);
function getItemsOfType(channelData, type) {
return channelData.filter(item => item.post_type[0] === type);
}
function collectPosts(data, postTypes, config) {
function collectPosts(channelData, postTypes, config) {
// this is passed into getPostContent() for the markdown conversion
const turndownService = translator.initTurndownService();
let allPosts = [];
postTypes.forEach(postType => {
const postsForType = getItemsOfType(data, postType)
.filter(post => post.status[0] !== 'trash' && post.status[0] !== 'draft')
.map(post => ({
const postsForType = getItemsOfType(channelData, postType)
.filter(postData => postData.status[0] !== 'trash' && postData.status[0] !== 'draft')
.map(postData => ({
// raw post data, used by frontmatter getters
data: postData,
// meta data isn't written to file, but is used to help with other things
meta: {
id: getPostId(post),
slug: getPostSlug(post),
coverImageId: getPostCoverImageId(post),
id: getPostId(postData),
slug: getPostSlug(postData),
coverImageId: getPostCoverImageId(postData),
coverImage: undefined, // possibly set later in mergeImagesIntoPosts()
type: postType,
imageUrls: []
imageUrls: [] // possibly set later in mergeImagesIntoPosts()
},
frontmatter: {
title: getPostTitle(post),
date: getPostDate(post),
categories: getCategories(post),
tags: getTags(post)
},
content: translator.getPostContent(post, turndownService, config)
// contents of the post in markdown
content: translator.getPostContent(postData, turndownService, config)
}));
if (postTypes.length > 1) {
@@ -87,61 +92,26 @@ function collectPosts(data, postTypes, config) {
return allPosts;
}
function getPostId(post) {
return post.post_id[0];
function getPostId(postData) {
return postData.post_id[0];
}
function getPostSlug(post) {
return decodeURIComponent(post.post_name[0]);
function getPostSlug(postData) {
return decodeURIComponent(postData.post_name[0]);
}
function getPostCoverImageId(post) {
if (post.postmeta === undefined) {
function getPostCoverImageId(postData) {
if (postData.postmeta === undefined) {
return undefined;
}
const postmeta = post.postmeta.find(postmeta => postmeta.meta_key[0] === '_thumbnail_id');
const postmeta = postData.postmeta.find(postmeta => postmeta.meta_key[0] === '_thumbnail_id');
const id = postmeta ? postmeta.meta_value[0] : undefined;
return id;
}
function getPostTitle(post) {
return post.title[0];
}
function getPostDate(post) {
const dateTime = luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: settings.custom_date_timezone || 'utc' });
if (settings.custom_date_formatting) {
return dateTime.toFormat(settings.custom_date_formatting);
} else if (settings.include_time_with_date) {
return dateTime.toISO();
} else {
return dateTime.toISODate();
}
}
function getCategories(post) {
const categories = processCategoryTags(post, 'category');
return categories.filter(category => !settings.filter_categories.includes(category));
}
function getTags(post) {
return processCategoryTags(post, 'post_tag');
}
function processCategoryTags(post, domain) {
if (!post.category) {
return [];
}
return post.category
.filter(category => category.$.domain === domain)
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
}
function collectAttachedImages(data) {
const images = getItemsOfType(data, 'attachment')
function collectAttachedImages(channelData) {
const images = getItemsOfType(channelData, 'attachment')
// filter to certain image file types
.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
.map(attachment => ({
@@ -154,13 +124,13 @@ function collectAttachedImages(data) {
return images;
}
function collectScrapedImages(data, postTypes) {
function collectScrapedImages(channelData, postTypes) {
const images = [];
postTypes.forEach(postType => {
getItemsOfType(data, postType).forEach(post => {
const postId = post.post_id[0];
const postContent = post.encoded[0];
const postLink = post.link[0];
getItemsOfType(channelData, postType).forEach(postData => {
const postId = postData.post_id[0];
const postContent = postData.encoded[0];
const postLink = postData.link[0];
const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
matches.forEach(match => {
@@ -192,7 +162,7 @@ function mergeImagesIntoPosts(images, posts) {
// this image was set as the featured image for this post
if (image.id === post.meta.coverImageId) {
shouldAttach = true;
post.frontmatter.coverImage = shared.getFilenameFromUrl(image.url);
post.meta.coverImage = shared.getFilenameFromUrl(image.url);
}
if (shouldAttach && !post.meta.imageUrls.includes(image.url)) {
@@ -202,4 +172,21 @@ function mergeImagesIntoPosts(images, posts) {
});
}
/**
 * Builds the frontmatter object for each post and assigns it to `post.frontmatter`.
 *
 * Fields are taken from `settings.frontmatter_fields` in order. Each entry is either a
 * getter name ('date') or a getter name plus an alias ('date:created'); the value returned
 * by the getter is stored under the alias when one is given, otherwise under the name.
 *
 * @param {Array<Object>} posts - post objects; each one is passed to the getters and
 *   gets its `frontmatter` property replaced
 * @throws {string} when a configured field has no frontmatter getter of that name
 */
function populateFrontmatter(posts) {
	posts.forEach(post => {
		const frontmatter = {};
		settings.frontmatter_fields.forEach(field => {
			// declared locally so key/alias do not leak as implicit globals
			// (assigning to undeclared names also throws in strict mode)
			const [key, alias] = field.split(':');

			const frontmatterGetter = frontmatterGetters[key];
			if (!frontmatterGetter) {
				// a plain string is thrown on purpose, so whatever handles this
				// failure keeps receiving the same value
				throw `Could not find a frontmatter getter named "${key}".`;
			}

			frontmatter[alias || key] = frontmatterGetter(post);
		});
		post.frontmatter = frontmatter;
	});
}
exports.parseFilePromise = parseFilePromise;
+23 -11
View File
@@ -1,22 +1,34 @@
// time in ms to wait between requesting image files
// increase this if you see timeouts or server errors
// Which fields to include in frontmatter. Look in /src/frontmatter to see available fields.
// Order is preserved. If a field has an empty value, it will not be included. You can rename a
// field by providing an alias after a ':'. For example, 'date:created' will include 'date' in
// frontmatter, but renamed to 'created'.
exports.frontmatter_fields = [
'title',
'date',
'categories',
'tags',
'coverImage'
];
// Time in ms to wait between requesting image files. Increase this if you see timeouts or
// server errors.
exports.image_file_request_delay = 500;
// time in ms to wait between saving Markdown files
// increase this if your file system becomes overloaded
// Time in ms to wait between saving Markdown files. Increase this if your file system becomes
// overloaded.
exports.markdown_file_write_delay = 25;
// enable this to include time with post dates
// for example, "2020-12-25" would become "2020-12-25T11:20:35.000Z"
// Enable this to include time with post dates. For example, "2020-12-25" would become
// "2020-12-25T11:20:35.000Z".
exports.include_time_with_date = false;
// override post date formatting with a custom formatting string (for example: 'yyyy LLL dd')
// tokens are documented here: https://moment.github.io/luxon/docs/manual/formatting.html#table-of-tokens
// if set, this takes precedence over include_time_with_date
// Override post date formatting with a custom formatting string (for example: 'yyyy LLL dd').
// Tokens are documented here: https://moment.github.io/luxon/#/parsing?id=table-of-tokens. If
// set, this takes precedence over include_time_with_date.
exports.custom_date_formatting = '';
// categories to be excluded from post frontmatter
// this does not filter out posts themselves, just the categories listed in their frontmatter
// Categories to be excluded from post frontmatter. This does not filter out posts themselves,
// just the categories listed in their frontmatter.
exports.filter_categories = ['uncategorized'];
// override date timezone
+50 -12
View File
@@ -44,24 +44,62 @@ function initTurndownService() {
}
});
// preserve iframes (common for embedded audio/video)
// iframe boolean attributes do not need to be set to empty string
turndownService.addRule('iframe', {
filter: 'iframe',
replacement: (content, node) => {
const html = node.outerHTML.replace('allowfullscreen=""', 'allowfullscreen');
const html = node.outerHTML
.replace('allowfullscreen=""', 'allowfullscreen')
.replace('allowpaymentrequest=""', 'allowpaymentrequest');
return '\n\n' + html + '\n\n';
}
});
// preserve <figure> when it contains a <figcaption>
turndownService.addRule('figure', {
filter: 'figure',
replacement: (content, node) => {
if (node.querySelector('figcaption')) {
// extra newlines are necessary for markdown and HTML to render correctly together
const result = '\n\n<figure>\n\n' + content + '\n\n</figure>\n\n';
return result.replace('\n\n\n\n', '\n\n'); // collapse quadruple newlines
} else {
// does not contain <figcaption>, do not preserve
return content;
}
}
});
// preserve <figcaption>
turndownService.addRule('figcaption', {
filter: 'figcaption',
replacement: (content, node) => {
// extra newlines are necessary for markdown and HTML to render correctly together
return '\n\n<figcaption>\n\n' + content + '\n\n</figcaption>\n\n';
}
});
// convert <pre> into a code block with language when appropriate
turndownService.addRule('pre', {
filter: node => {
// a <pre> with <code> inside will already render nicely, so don't interfere
return node.nodeName === 'PRE' && !node.querySelector('code');
},
replacement: (content, node) => {
const language = node.getAttribute('data-wetm-language') || '';
return '\n\n```' + language + '\n' + node.textContent + '\n```\n\n';
}
});
return turndownService;
}
function getPostContent(post, turndownService, config) {
let content = post.encoded[0];
function getPostContent(postData, turndownService, config) {
let content = postData.encoded[0];
// insert an empty div element between double line breaks
// this nifty trick causes turndown to keep adjacent paragraphs separated
// without mucking up content inside of other elemnts (like <code> blocks)
// without mucking up content inside of other elements (like <code> blocks)
content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');
if (config.saveScrapedImages) {
@@ -70,10 +108,13 @@ function getPostContent(post, turndownService, config) {
content = content.replace(/(<img[^>]*src=").*?([^/"]+\.(?:gif|jpe?g|png))("[^>]*>)/gi, '$1images/$2$3');
}
// this is a hack to make <iframe> nodes non-empty by inserting a "." which
// allows the iframe rule declared in initTurndownService() to take effect
// (using turndown's blankRule() and keep() solution did not work for me)
content = content.replace(/(<\/iframe>)/gi, '.$1');
// preserve "more" separator, max one per post, optionally with custom label
// by escaping angle brackets (will be unescaped during turndown conversion)
content = content.replace(/<(!--more( .*)?--)>/, '&lt;$1&gt;');
// some WordPress plugins specify a code language in an HTML comment above a
// <pre> block, save it to a data attribute so the "pre" rule can use it
content = content.replace(/(<!-- wp:.+? \{"language":"(.+?)"\} -->\r?\n<pre )/g, '$1data-wetm-language="$2" ');
// use turndown to convert HTML to Markdown
content = turndownService.turndown(content);
@@ -81,9 +122,6 @@ function getPostContent(post, turndownService, config) {
// clean up extra spaces in list items
content = content.replace(/(-|\d+\.) +/g, '$1 ');
// clean up the "." from the iframe hack above
content = content.replace(/\.(<\/iframe>)/gi, '$1');
return content;
}
+10 -3
View File
@@ -26,7 +26,7 @@ async function processPayloadsPromise(payloads, loadFunc) {
}
}, payload.delay);
}));
const results = await Promise.allSettled(promises);
const failedCount = results.filter(result => result.status === 'rejected').length;
if (failedCount === 0) {
@@ -85,7 +85,9 @@ async function loadMarkdownFilePromise(post) {
} else {
// single string value
const escapedValue = (value || '').replace(/"/g, '\\"');
outputValue = `"${escapedValue}"`;
if (escapedValue.length > 0) {
outputValue = `"${escapedValue}"`;
}
}
if (outputValue !== undefined) {
@@ -157,7 +159,12 @@ async function loadImageFilePromise(imageUrl) {
}
function getPostPath(post, config) {
const dt = luxon.DateTime.fromISO(post.frontmatter.date);
let dt;
if (settings.custom_date_formatting) {
dt = luxon.DateTime.fromFormat(post.frontmatter.date, settings.custom_date_formatting);
} else {
dt = luxon.DateTime.fromISO(post.frontmatter.date);
}
// start with base output dir
const pathSegments = [config.output];