merge master and move code as needed

2026-06-05 15:09:59 +09:00 · 2024-02-27 08:13:37 -05:00
parent 914ab1c1ed 873ce425a9
commit dce423ac16
21 changed files with 686 additions and 1864 deletions
@@ -0,0 +1 @@
+* text=auto
@@ -0,0 +1 @@
+ko_fi: lonekorean
@@ -0,0 +1,23 @@
+# How to Contribute
+
+Contributions are welcome! Thank you!
+
+## General Guidelines
+
+Some quick notes when making a pull request.
+
+- Match the style and formatting of the code you are editing.
+- Each pull request should be focused on a single thing (a single bug fix, a single feature, etc.). This makes reviewing easier and minimizes merge conflicts.
+- Include a description of the problem being solved and what your code does. Steps to reproduce the problem or example input/output are very helpful.
+
+## Adding Options
+
+Keeping the wizard as short as possible is a priority. Pull requests that add options to the wizard will probably not be accepted. Instead, you can add an advanced setting to [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js).
+
+## Adding Frontmatter Fields
+
+Similarly, default frontmatter output is limited to just a few widely used fields to avoid bloat. However, you may add new optional frontmatter fields.
+
+To do so, follow the instructions in [/src/frontmatter/example.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/frontmatter/example.js).
+
+Users will be able to include your new frontmatter field by editing `frontmatter_fields` in [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js).
@@ -165,6 +165,6 @@ Some WordPress sites make use of a `"page"` post type and/or custom post types.

 ## Advanced Settings

-You can edit [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js) to tweak advanced settings. This includes things like throttling image downloads or customizing the date format in frontmatter.
+You can edit [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js) to tweak advanced settings. This includes things like customizing frontmatter fields and throttling image downloads.

 You'll need to run the script locally (not using `npx`) to make use of advanced settings.
@@ -1,6 +1,6 @@
 {
 	"name": "wordpress-export-to-markdown",
-	"version": "2.2.2",
+	"version": "2.3.1",
 	"description": "Converts a WordPress export XML file into Markdown files.",
 	"main": "index.js",
 	"repository": "https://github.com/lonekorean/wordpress-export-to-markdown.git",
@@ -25,12 +25,13 @@
 		"commander": "^5.0.0",
 		"compare-versions": "^3.6.0",
 		"inquirer": "^7.1.0",
-		"luxon": "^1.23.0",
+		"luxon": "^3.4.4",
 		"request": "^2.88.2",
 		"request-promise-native": "^1.0.8",
-		"turndown": "^7.0.0",
+		"require-directory": "^2.1.1",
+		"turndown": "^7.1.2",
 		"turndown-plugin-gfm": "^1.0.2",
-		"xml2js": "^0.4.23"
+		"xml2js": "^0.6.2"
 	},
 	"devDependencies": {
 		"eslint": "^6.8.0"
@@ -0,0 +1,5 @@
+// get author as-is from the first creator entry, no decoding needed
+// since WordPress doesn't allow funky characters in usernames anyway
+module.exports = ({ data }) => {
+	return data.creator[0];
+};
@@ -0,0 +1,14 @@
+const settings = require('../settings');
+
+// build the array of decoded category names, minus any listed in settings.filter_categories
+module.exports = (post) => {
+	const terms = post.data.category;
+	if (!terms) {
+		return [];
+	}
+
+	return terms
+		.filter(term => term.$.domain === 'category')
+		.map(term => decodeURIComponent(term.$.nicename))
+		.filter(name => !settings.filter_categories.includes(name));
+};
@@ -0,0 +1,5 @@
+// get cover image filename, which was decoded earlier and stored on post.meta
+// unlike the other getters, this depends on special logic the parser runs beforehand
+module.exports = ({ meta }) => {
+	return meta.coverImage;
+};
@@ -0,0 +1,16 @@
+const luxon = require('luxon');
+
+const settings = require('../settings');
+
+// get post date parsed from pubDate (zone: settings.custom_date_timezone or 'utc'), formatted per settings
+module.exports = (post) => {
+	// raw export fields live under post.data, same as every other getter (post.pubDate is undefined)
+	const dateTime = luxon.DateTime.fromRFC2822(post.data.pubDate[0], { zone: settings.custom_date_timezone || 'utc' });
+	if (settings.custom_date_formatting) {
+		return dateTime.toFormat(settings.custom_date_formatting);
+	} else if (settings.include_time_with_date) {
+		return dateTime.toISO();
+	} else {
+		return dateTime.toISODate();
+	}
+};
@@ -0,0 +1,19 @@
+/*
+	1. Copy this file and rename the copy to the frontmatter field name you want, in camelCase
+	2. Edit frontmatter_fields in settings.js to include your new field name
+	3. Run the script to see post data dumps, to see what you can work with
+	4. Write your code to get and return what you want
+	5. Update "get whatever" comment to describe what you're getting
+	6. Remove your field name from frontmatter_fields in settings.js
+	7. Remove this comment block and the debug console code
+	8. Make that pull request!
+*/
+
+// get whatever
+module.exports = (post) => {
+	console.log('\nBEGIN POST DATA DUMP ===========================================================\n');
+	console.dir(post, { depth: null });
+	console.log('\nEND POST DATA DUMP =============================================================\n');
+
+	return 'EXAMPLE: ' + post.data.title[0];
+};
@@ -0,0 +1,4 @@
+// get excerpt from the second encoded entry, not decoded, with each run of newlines collapsed to a space
+module.exports = ({ data }) => {
+	return data.encoded[1].replace(/[\r\n]+/gm, ' ');
+};
@@ -0,0 +1,4 @@
+// get ID, read from the first post_id entry
+module.exports = ({ data }) => {
+	return data.post_id[0];
+};
@@ -0,0 +1,4 @@
+// get slug, which was decoded earlier and stored on post.meta
+module.exports = ({ meta }) => {
+	return meta.slug;
+};
@@ -0,0 +1,12 @@
+// build the array of decoded tag names
+// tags are the category entries whose domain is "post_tag"
+module.exports = (post) => {
+	const terms = post.data.category;
+	if (!terms) {
+		return [];
+	}
+
+	return terms
+		.filter(term => term.$.domain === 'post_tag')
+		.map(term => decodeURIComponent(term.$.nicename));
+};
@@ -0,0 +1,4 @@
+// get post title as-is; unlike other frontmatter string fields, it is not decoded
+module.exports = ({ data }) => {
+	return data.title[0];
+};
@@ -0,0 +1,5 @@
+// get post type, which is usually "post"
+// but may also be "page" or a custom type
+module.exports = ({ data }) => {
+	return data.post_type[0];
+};
@@ -1,40 +1,45 @@
 const fs = require('fs');
-const luxon = require('luxon');
+const requireDirectory = require('require-directory');
 const xml2js = require('xml2js');

 const shared = require('./shared');
 const settings = require('./settings');
 const translator = require('./translator');

+// dynamically requires all frontmatter getters
+const frontmatterGetters = requireDirectory(module, './frontmatter', { recurse: false });
+
 async function parseFilePromise(config) {
 	console.log('\nParsing...');
 	const content = await fs.promises.readFile(config.input, 'utf8');
-	const data = await xml2js.parseStringPromise(content, {
+	const allData = await xml2js.parseStringPromise(content, {
 		trim: true,
 		tagNameProcessors: [xml2js.processors.stripPrefix]
 	});
+	const channelData = allData.rss.channel[0].item;

-	const postTypes = getPostTypes(data, config);
-	const posts = collectPosts(data, postTypes, config);
+	const postTypes = getPostTypes(channelData, config);
+	const posts = collectPosts(channelData, postTypes, config);

 	const images = [];
 	if (config.saveAttachedImages) {
-		images.push(...collectAttachedImages(data));
+		images.push(...collectAttachedImages(channelData));
 	}
 	if (config.saveScrapedImages) {
-		images.push(...collectScrapedImages(data, postTypes));
+		images.push(...collectScrapedImages(channelData, postTypes));
 	}

 	mergeImagesIntoPosts(images, posts);
+	populateFrontmatter(posts);

 	return posts;
 }

-function getPostTypes(data, config) {
+function getPostTypes(channelData, config) {
 	if (config.includeOtherTypes) {
 		// search export file for all post types minus some default types we don't want
 		// effectively this will be 'post', 'page', and custom post types
-		const types = data.rss.channel[0].item
+		const types = channelData
 			.map(item => item.post_type[0])
 			.filter(type => !['attachment', 'revision', 'nav_menu_item', 'custom_css', 'customize_changeset'].includes(type));
 		return [...new Set(types)]; // remove duplicates
@@ -44,34 +49,34 @@ function getPostTypes(data, config) {
 	}
 }

-function getItemsOfType(data, type) {
-	return data.rss.channel[0].item.filter(item => item.post_type[0] === type);
+function getItemsOfType(channelData, type) {
+	return channelData.filter(item => item.post_type[0] === type);
 }

-function collectPosts(data, postTypes, config) {
+function collectPosts(channelData, postTypes, config) {
 	// this is passed into getPostContent() for the markdown conversion
 	const turndownService = translator.initTurndownService();

 	let allPosts = [];
 	postTypes.forEach(postType => {
-		const postsForType = getItemsOfType(data, postType)
-			.filter(post => post.status[0] !== 'trash' && post.status[0] !== 'draft')
-			.map(post => ({
+		const postsForType = getItemsOfType(channelData, postType)
+			.filter(postData => postData.status[0] !== 'trash' && postData.status[0] !== 'draft')
+			.map(postData => ({
+				// raw post data, used by frontmatter getters
+				data: postData,
+
 				// meta data isn't written to file, but is used to help with other things
 				meta: {
-					id: getPostId(post),
-					slug: getPostSlug(post),
-					coverImageId: getPostCoverImageId(post),
+					id: getPostId(postData),
+					slug: getPostSlug(postData),
+					coverImageId: getPostCoverImageId(postData),
+					coverImage: undefined, // possibly set later in mergeImagesIntoPosts()
 					type: postType,
-					imageUrls: []
+					imageUrls: [] // possibly set later in mergeImagesIntoPosts()
 				},
-				frontmatter: {
-					title: getPostTitle(post),
-					date: getPostDate(post),
-					categories: getCategories(post),
-					tags: getTags(post)
-				},
-				content: translator.getPostContent(post, turndownService, config)
+
+				// contents of the post in markdown
+				content: translator.getPostContent(postData, turndownService, config)
 			}));

 		if (postTypes.length > 1) {
@@ -87,61 +92,26 @@ function collectPosts(data, postTypes, config) {
 	return allPosts;
 }

-function getPostId(post) {
-	return post.post_id[0];
+function getPostId(postData) {
+	return postData.post_id[0];
 }

-function getPostSlug(post) {
-	return decodeURIComponent(post.post_name[0]);
+function getPostSlug(postData) {
+	return decodeURIComponent(postData.post_name[0]);
 }

-function getPostCoverImageId(post) {
-	if (post.postmeta === undefined) {
+function getPostCoverImageId(postData) {
+	if (postData.postmeta === undefined) {
 		return undefined;
 	}

-	const postmeta = post.postmeta.find(postmeta => postmeta.meta_key[0] === '_thumbnail_id');
+	const postmeta = postData.postmeta.find(postmeta => postmeta.meta_key[0] === '_thumbnail_id');
 	const id = postmeta ? postmeta.meta_value[0] : undefined;
 	return id;
 }

-function getPostTitle(post) {
-	return post.title[0];
-}
-
-function getPostDate(post) {
-	const dateTime = luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: settings.custom_date_timezone || 'utc' });
-
-	if (settings.custom_date_formatting) {
-		return dateTime.toFormat(settings.custom_date_formatting);
-	} else if (settings.include_time_with_date) {
-		return dateTime.toISO();
-	} else {
-		return dateTime.toISODate();
-	}
-}
-
-function getCategories(post) {
-	const categories = processCategoryTags(post, 'category');
-	return categories.filter(category => !settings.filter_categories.includes(category));
-}
-
-function getTags(post) {
-	return processCategoryTags(post, 'post_tag');
-}
-
-function processCategoryTags(post, domain) {
-	if (!post.category) {
-		return [];
-	}
-
-	return post.category
-		.filter(category => category.$.domain === domain)
-		.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
-}
-
-function collectAttachedImages(data) {
-	const images = getItemsOfType(data, 'attachment')
+function collectAttachedImages(channelData) {
+	const images = getItemsOfType(channelData, 'attachment')
 		// filter to certain image file types
 		.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
 		.map(attachment => ({
@@ -154,13 +124,13 @@ function collectAttachedImages(data) {
 	return images;
 }

-function collectScrapedImages(data, postTypes) {
+function collectScrapedImages(channelData, postTypes) {
 	const images = [];
 	postTypes.forEach(postType => {
-		getItemsOfType(data, postType).forEach(post => {
-			const postId = post.post_id[0];
-			const postContent = post.encoded[0];
-			const postLink = post.link[0];
+		getItemsOfType(channelData, postType).forEach(postData => {
+			const postId = postData.post_id[0];
+			const postContent = postData.encoded[0];
+			const postLink = postData.link[0];

 			const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
 			matches.forEach(match => {
@@ -192,7 +162,7 @@ function mergeImagesIntoPosts(images, posts) {
 			// this image was set as the featured image for this post
 			if (image.id === post.meta.coverImageId) {
 				shouldAttach = true;
-				post.frontmatter.coverImage = shared.getFilenameFromUrl(image.url);
+				post.meta.coverImage = shared.getFilenameFromUrl(image.url);
 			}

 			if (shouldAttach && !post.meta.imageUrls.includes(image.url)) {
@@ -202,4 +172,21 @@ function mergeImagesIntoPosts(images, posts) {
 	});
 }

+function populateFrontmatter(posts) {
+	// sets post.frontmatter on every post, built from settings.frontmatter_fields in listed order
+	posts.forEach(post => {
+		const frontmatter = {};
+		settings.frontmatter_fields.forEach(field => {
+			// entries look like 'key' or 'key:alias'; declare locally so nothing leaks into global scope
+			const [key, alias] = field.split(':');
+			const frontmatterGetter = frontmatterGetters[key];
+			if (!frontmatterGetter) {
+				throw `Could not find a frontmatter getter named "${key}".`;
+			}
+			frontmatter[alias || key] = frontmatterGetter(post);
+		});
+		post.frontmatter = frontmatter;
+	});
+}
+
 exports.parseFilePromise = parseFilePromise;
@@ -1,22 +1,34 @@
-// time in ms to wait between requesting image files
-// increase this if you see timeouts or server errors
+// Which fields to include in frontmatter. Look in /src/frontmatter to see available fields.
+// Order is preserved. If a field has an empty value, it will not be included. You can rename a
+// field by providing an alias after a ':'. For example, 'date:created' will include 'date' in
+// frontmatter, but renamed to 'created'.
+exports.frontmatter_fields = [
+	'title',
+	'date',
+	'categories',
+	'tags',
+	'coverImage'
+];
+
+// Time in ms to wait between requesting image files. Increase this if you see timeouts or
+// server errors.
 exports.image_file_request_delay = 500;

-// time in ms to wait between saving Markdown files
-// increase this if your file system becomes overloaded
+// Time in ms to wait between saving Markdown files. Increase this if your file system becomes
+// overloaded.
 exports.markdown_file_write_delay = 25;

-// enable this to include time with post dates
-// for example, "2020-12-25" would become "2020-12-25T11:20:35.000Z"
+// Enable this to include time with post dates. For example, "2020-12-25" would become
+// "2020-12-25T11:20:35.000Z".
 exports.include_time_with_date = false;

-// override post date formatting with a custom formatting string (for example: 'yyyy LLL dd')
-// tokens are documented here: https://moment.github.io/luxon/docs/manual/formatting.html#table-of-tokens
-// if set, this takes precedence over include_time_with_date
+// Override post date formatting with a custom formatting string (for example: 'yyyy LLL dd').
+// Tokens are documented here: https://moment.github.io/luxon/#/parsing?id=table-of-tokens. If
+// set, this takes precedence over include_time_with_date.
 exports.custom_date_formatting = '';

-// categories to be excluded from post frontmatter
-// this does not filter out posts themselves, just the categories listed in their frontmatter
+// Categories to be excluded from post frontmatter. This does not filter out posts themselves,
+// just the categories listed in their frontmatter.
 exports.filter_categories = ['uncategorized'];

 // override date timezone
@@ -44,24 +44,62 @@ function initTurndownService() {
 		}
 	});

-	// preserve iframes (common for embedded audio/video)
+	// iframe boolean attributes do not need to be set to empty string
 	turndownService.addRule('iframe', {
 		filter: 'iframe',
 		replacement: (content, node) => {
-			const html = node.outerHTML.replace('allowfullscreen=""', 'allowfullscreen');
+			const html = node.outerHTML
+				.replace('allowfullscreen=""', 'allowfullscreen')
+				.replace('allowpaymentrequest=""', 'allowpaymentrequest');
 			return '\n\n' + html + '\n\n';
 		}
 	});

+	// preserve <figure> when it contains a <figcaption>
+	turndownService.addRule('figure', {
+		filter: 'figure',
+		replacement: (content, node) => {
+			if (node.querySelector('figcaption')) {
+				// extra newlines are necessary for markdown and HTML to render correctly together
+				const result = '\n\n<figure>\n\n' + content + '\n\n</figure>\n\n';
+				return result.replace('\n\n\n\n', '\n\n'); // collapse quadruple newlines
+			} else {
+				// does not contain <figcaption>, do not preserve
+				return content;
+			}
+		}
+	});
+
+	// preserve <figcaption>
+	turndownService.addRule('figcaption', {
+		filter: 'figcaption',
+		replacement: (content, node) => {
+			// extra newlines are necessary for markdown and HTML to render correctly together
+			return '\n\n<figcaption>\n\n' + content + '\n\n</figcaption>\n\n';
+		}
+	});
+
+	// convert <pre> into a code block with language when appropriate
+	turndownService.addRule('pre', {
+		filter: node => {
+			// a <pre> with <code> inside will already render nicely, so don't interfere
+			return node.nodeName === 'PRE' && !node.querySelector('code');
+		},
+		replacement: (content, node) => {
+			const language = node.getAttribute('data-wetm-language') || '';
+			return '\n\n```' + language + '\n' + node.textContent + '\n```\n\n';
+		}
+	});
+
 	return turndownService;
 }

-function getPostContent(post, turndownService, config) {
-	let content = post.encoded[0];
+function getPostContent(postData, turndownService, config) {
+	let content = postData.encoded[0];

 	// insert an empty div element between double line breaks
 	// this nifty trick causes turndown to keep adjacent paragraphs separated
-	// without mucking up content inside of other elemnts (like <code> blocks)
+	// without mucking up content inside of other elements (like <code> blocks)
 	content = content.replace(/(\r?\n){2}/g, '\n<div></div>\n');

 	if (config.saveScrapedImages) {
@@ -70,10 +108,13 @@ function getPostContent(post, turndownService, config) {
 		content = content.replace(/(<img[^>]*src=").*?([^/"]+\.(?:gif|jpe?g|png))("[^>]*>)/gi, '$1images/$2$3');
 	}

-	// this is a hack to make <iframe> nodes non-empty by inserting a "." which
-	// allows the iframe rule declared in initTurndownService() to take effect
-	// (using turndown's blankRule() and keep() solution did not work for me)
-	content = content.replace(/(<\/iframe>)/gi, '.$1');
+	// preserve "more" separator, max one per post, optionally with custom label
+	// by escaping angle brackets (will be unescaped during turndown conversion)
+	content = content.replace(/<(!--more( .*)?--)>/, '&lt;$1&gt;');
+
+	// some WordPress plugins specify a code language in an HTML comment above a
+	// <pre> block, save it to a data attribute so the "pre" rule can use it
+	content = content.replace(/(<!-- wp:.+? \{"language":"(.+?)"\} -->\r?\n<pre )/g, '$1data-wetm-language="$2" ');

 	// use turndown to convert HTML to Markdown
 	content = turndownService.turndown(content);
@@ -81,9 +122,6 @@ function getPostContent(post, turndownService, config) {
 	// clean up extra spaces in list items
 	content = content.replace(/(-|\d+\.) +/g, '$1 ');

-	// clean up the "." from the iframe hack above
-	content = content.replace(/\.(<\/iframe>)/gi, '$1');
-
 	return content;
 }

@@ -26,7 +26,7 @@ async function processPayloadsPromise(payloads, loadFunc) {
 			}
 		}, payload.delay);
 	}));
-	
+
 	const results = await Promise.allSettled(promises);
 	const failedCount = results.filter(result => result.status === 'rejected').length;
 	if (failedCount === 0) {
@@ -85,7 +85,9 @@ async function loadMarkdownFilePromise(post) {
 		} else {
 			// single string value
 			const escapedValue = (value || '').replace(/"/g, '\\"');
-			outputValue = `"${escapedValue}"`;
+			if (escapedValue.length > 0) {
+				outputValue = `"${escapedValue}"`;
+			}
 		}

 		if (outputValue !== undefined) {
@@ -157,7 +159,12 @@ async function loadImageFilePromise(imageUrl) {
 }

 function getPostPath(post, config) {
-	const dt = luxon.DateTime.fromISO(post.frontmatter.date);
+	let dt;
+	if (settings.custom_date_formatting) {
+		dt = luxon.DateTime.fromFormat(post.frontmatter.date, settings.custom_date_formatting);
+	} else {
+		dt = luxon.DateTime.fromISO(post.frontmatter.date);
+	}

 	// start with base output dir
 	const pathSegments = [config.output];