Merge pull request #110 from lonekorean/customizable-frontmatter

Customizable frontmatter
This commit is contained in:
Will Boyd
2024-02-24 14:56:58 -05:00
committed by GitHub
14 changed files with 194 additions and 90 deletions
+23
View File
@@ -0,0 +1,23 @@
# How to Contribute
Contributions are welcome! Thank you!
## General Guidelines
Some quick notes when making a pull request.
- Match the style and formatting of the code you are editing.
- Each pull request should be focused on a single thing (a single bug fix, a single feature, etc.). This makes reviewing easier and minimizes merge conflicts.
- Include a description of the problem being solved and what your code does. Steps to reproduce the problem or example input/output are very helpful.
## Adding Options
Keeping the wizard as short as possible is a priority. Pull requests that add options to the wizard will probably not be accepted. Instead, you can add an advanced setting to [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js).
## Adding Frontmatter Fields
Similarly, default frontmatter output is limited to just a few widely used fields to avoid bloat. However, you may add new optional frontmatter fields.
To do so, follow the instructions in [/src/frontmatter/example.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/frontmatter/example.js).
Users will be able to include your new frontmatter field by editing `frontmatter_fields` in [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js).
+1 -1
View File
@@ -165,6 +165,6 @@ Some WordPress sites make use of a `"page"` post type and/or custom post types.
## Advanced Settings
You can edit [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js) to tweak advanced settings. This includes things like throttling image downloads or customizing the date format in frontmatter.
You can edit [settings.js](https://github.com/lonekorean/wordpress-export-to-markdown/blob/master/src/settings.js) to tweak advanced settings. This includes things like customizing frontmatter fields and throttling image downloads.
You'll need to run the script locally (not using `npx`) to make use of advanced settings.
+9
View File
@@ -17,6 +17,7 @@
"luxon": "^3.4.4",
"request": "^2.88.2",
"request-promise-native": "^1.0.8",
"require-directory": "^2.1.1",
"turndown": "^7.1.2",
"turndown-plugin-gfm": "^1.0.2",
"xml2js": "^0.6.2"
@@ -1545,6 +1546,14 @@
"request": "^2.34"
}
},
"node_modules/require-directory": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
"integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/resolve-from": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz",
+1
View File
@@ -28,6 +28,7 @@
"luxon": "^3.4.4",
"request": "^2.88.2",
"request-promise-native": "^1.0.8",
"require-directory": "^2.1.1",
"turndown": "^7.1.2",
"turndown-plugin-gfm": "^1.0.2",
"xml2js": "^0.6.2"
+14
View File
@@ -0,0 +1,14 @@
const settings = require('../settings');
// get array of categories for post, filtered as specified in settings
module.exports = (post) => {
if (!post.data.category) {
return [];
}
const categories = post.data.category
.filter(category => category.$.domain === 'category')
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
return categories.filter(category => !settings.filter_categories.includes(category));
};
+5
View File
@@ -0,0 +1,5 @@
// get the cover image filename that was stored on post.meta earlier
// unlike the other getters, this value is produced by parser logic rather than read from raw post data
module.exports = (post) => post.meta.coverImage;
+16
View File
@@ -0,0 +1,16 @@
const luxon = require('luxon');
const settings = require('../settings');
// get post date, optionally formatted as specified in settings
module.exports = (post) => {
const dateTime = luxon.DateTime.fromRFC2822(post.data.pubDate[0], { zone: 'utc' });
if (settings.custom_date_formatting) {
return dateTime.toFormat(settings.custom_date_formatting);
} else if (settings.include_time_with_date) {
return dateTime.toISO();
} else {
return dateTime.toISODate();
}
};
+19
View File
@@ -0,0 +1,19 @@
/*
1. Copy this file, rename to the frontmatter field name you want, camelcased
2. Edit frontmatter_fields in settings.js to include your new field name
3. Run the script to see post data dumps, to see what you can work with
4. Write your code to get and return what you want
5. Update "get whatever" comment to describe what you're getting
6. Remove your field name from frontmatter_fields in settings.js
7. Remove this comment block and the debug console code
8. Make that pull request!
*/
// get whatever
module.exports = (post) => {
console.log('\nBEGIN POST DATA DUMP ===========================================================\n');
console.dir(post, { depth: null });
console.log('\nEND POST DATA DUMP =============================================================\n');
return 'EXAMPLE: ' + post.data.title[0];
};
+12
View File
@@ -0,0 +1,12 @@
// get array of tags for post
module.exports = (post) => {
if (!post.data.category) {
return [];
}
const categories = post.data.category
.filter(category => category.$.domain === 'post_tag')
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
return categories;
};
+4
View File
@@ -0,0 +1,4 @@
// get the post title as-is (first entry of the raw title data)
module.exports = (post) => post.data.title[0];
+62 -75
View File
@@ -1,40 +1,45 @@
const fs = require('fs');
const luxon = require('luxon');
const requireDirectory = require('require-directory');
const xml2js = require('xml2js');
const shared = require('./shared');
const settings = require('./settings');
const translator = require('./translator');
// dynamically requires all frontmatter getters
const frontmatterGetters = requireDirectory(module, './frontmatter', { recurse: false });
async function parseFilePromise(config) {
console.log('\nParsing...');
const content = await fs.promises.readFile(config.input, 'utf8');
const data = await xml2js.parseStringPromise(content, {
const allData = await xml2js.parseStringPromise(content, {
trim: true,
tagNameProcessors: [xml2js.processors.stripPrefix]
});
const channelData = allData.rss.channel[0].item;
const postTypes = getPostTypes(data, config);
const posts = collectPosts(data, postTypes, config);
const postTypes = getPostTypes(channelData, config);
const posts = collectPosts(channelData, postTypes, config);
const images = [];
if (config.saveAttachedImages) {
images.push(...collectAttachedImages(data));
images.push(...collectAttachedImages(channelData));
}
if (config.saveScrapedImages) {
images.push(...collectScrapedImages(data, postTypes));
images.push(...collectScrapedImages(channelData, postTypes));
}
mergeImagesIntoPosts(images, posts);
populateFrontmatter(posts);
return posts;
}
function getPostTypes(data, config) {
function getPostTypes(channelData, config) {
if (config.includeOtherTypes) {
// search export file for all post types minus some default types we don't want
// effectively this will be 'post', 'page', and custom post types
const types = data.rss.channel[0].item
const types = channelData
.map(item => item.post_type[0])
.filter(type => !['attachment', 'revision', 'nav_menu_item', 'custom_css', 'customize_changeset'].includes(type));
return [...new Set(types)]; // remove duplicates
@@ -44,34 +49,34 @@ function getPostTypes(data, config) {
}
}
function getItemsOfType(data, type) {
return data.rss.channel[0].item.filter(item => item.post_type[0] === type);
function getItemsOfType(channelData, type) {
return channelData.filter(item => item.post_type[0] === type);
}
function collectPosts(data, postTypes, config) {
function collectPosts(channelData, postTypes, config) {
// this is passed into getPostContent() for the markdown conversion
const turndownService = translator.initTurndownService();
let allPosts = [];
postTypes.forEach(postType => {
const postsForType = getItemsOfType(data, postType)
.filter(post => post.status[0] !== 'trash' && post.status[0] !== 'draft')
.map(post => ({
const postsForType = getItemsOfType(channelData, postType)
.filter(postData => postData.status[0] !== 'trash' && postData.status[0] !== 'draft')
.map(postData => ({
// raw post data, used by frontmatter getters
data: postData,
// meta data isn't written to file, but is used to help with other things
meta: {
id: getPostId(post),
slug: getPostSlug(post),
coverImageId: getPostCoverImageId(post),
id: getPostId(postData),
slug: getPostSlug(postData),
coverImageId: getPostCoverImageId(postData),
coverImage: undefined, // possibly set later in mergeImagesIntoPosts()
type: postType,
imageUrls: []
imageUrls: [] // possibly set later in mergeImagesIntoPosts()
},
frontmatter: {
title: getPostTitle(post),
date: getPostDate(post),
categories: getCategories(post),
tags: getTags(post)
},
content: translator.getPostContent(post, turndownService, config)
// contents of the post in markdown
content: translator.getPostContent(postData, turndownService, config)
}));
if (postTypes.length > 1) {
@@ -87,61 +92,26 @@ function collectPosts(data, postTypes, config) {
return allPosts;
}
function getPostId(post) {
return post.post_id[0];
function getPostId(postData) {
return postData.post_id[0];
}
function getPostSlug(post) {
return decodeURIComponent(post.post_name[0]);
function getPostSlug(postData) {
return decodeURIComponent(postData.post_name[0]);
}
function getPostCoverImageId(post) {
if (post.postmeta === undefined) {
function getPostCoverImageId(postData) {
if (postData.postmeta === undefined) {
return undefined;
}
const postmeta = post.postmeta.find(postmeta => postmeta.meta_key[0] === '_thumbnail_id');
const postmeta = postData.postmeta.find(postmeta => postmeta.meta_key[0] === '_thumbnail_id');
const id = postmeta ? postmeta.meta_value[0] : undefined;
return id;
}
function getPostTitle(post) {
return post.title[0];
}
function getPostDate(post) {
const dateTime = luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: 'utc' });
if (settings.custom_date_formatting) {
return dateTime.toFormat(settings.custom_date_formatting);
} else if (settings.include_time_with_date) {
return dateTime.toISO();
} else {
return dateTime.toISODate();
}
}
function getCategories(post) {
const categories = processCategoryTags(post, 'category');
return categories.filter(category => !settings.filter_categories.includes(category));
}
function getTags(post) {
return processCategoryTags(post, 'post_tag');
}
function processCategoryTags(post, domain) {
if (!post.category) {
return [];
}
return post.category
.filter(category => category.$.domain === domain)
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
}
function collectAttachedImages(data) {
const images = getItemsOfType(data, 'attachment')
function collectAttachedImages(channelData) {
const images = getItemsOfType(channelData, 'attachment')
// filter to certain image file types
.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
.map(attachment => ({
@@ -154,13 +124,13 @@ function collectAttachedImages(data) {
return images;
}
function collectScrapedImages(data, postTypes) {
function collectScrapedImages(channelData, postTypes) {
const images = [];
postTypes.forEach(postType => {
getItemsOfType(data, postType).forEach(post => {
const postId = post.post_id[0];
const postContent = post.encoded[0];
const postLink = post.link[0];
getItemsOfType(channelData, postType).forEach(postData => {
const postId = postData.post_id[0];
const postContent = postData.encoded[0];
const postLink = postData.link[0];
const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
matches.forEach(match => {
@@ -192,7 +162,7 @@ function mergeImagesIntoPosts(images, posts) {
// this image was set as the featured image for this post
if (image.id === post.meta.coverImageId) {
shouldAttach = true;
post.frontmatter.coverImage = shared.getFilenameFromUrl(image.url);
post.meta.coverImage = shared.getFilenameFromUrl(image.url);
}
if (shouldAttach && !post.meta.imageUrls.includes(image.url)) {
@@ -202,4 +172,21 @@ function mergeImagesIntoPosts(images, posts) {
});
}
// builds post.frontmatter for every post, using the fields listed in settings.frontmatter_fields
// each entry is either 'name' or 'name:alias'; the getter named 'name' is looked up in
// frontmatterGetters and its result is stored under the alias if one was given, else under the name
// fields are processed in the order they are listed
// throws (a string) if an entry names a getter that does not exist
function populateFrontmatter(posts) {
	posts.forEach(post => {
		const frontmatter = {};
		settings.frontmatter_fields.forEach(field => {
			// declare locally; a bare `[key, alias] = ...` would create implicit globals
			// (or throw a ReferenceError in strict mode)
			const [key, alias] = field.split(':');

			const frontmatterGetter = frontmatterGetters[key];
			if (!frontmatterGetter) {
				// NOTE(review): a plain string is thrown here rather than an Error; left as-is so
				// whatever catches it elsewhere keeps seeing the same value
				throw `Could not find a frontmatter getter named "${key}".`;
			}

			// an empty or missing alias falls back to the getter's own name
			frontmatter[alias || key] = frontmatterGetter(post);
		});
		post.frontmatter = frontmatter;
	});
}
exports.parseFilePromise = parseFilePromise;
+23 -11
View File
@@ -1,20 +1,32 @@
// time in ms to wait between requesting image files
// increase this if you see timeouts or server errors
// Which fields to include in frontmatter. Look in /src/frontmatter to see available fields.
// Order is preserved. If a field has an empty value, it will not be included. You can rename a
// field by providing an alias after a ':'. For example, 'date:created' will include 'date' in
// frontmatter, but renamed to 'created'.
exports.frontmatter_fields = [
'title',
'date',
'categories',
'tags',
'coverImage'
];
// Time in ms to wait between requesting image files. Increase this if you see timeouts or
// server errors.
exports.image_file_request_delay = 500;
// time in ms to wait between saving Markdown files
// increase this if your file system becomes overloaded
// Time in ms to wait between saving Markdown files. Increase this if your file system becomes
// overloaded.
exports.markdown_file_write_delay = 25;
// enable this to include time with post dates
// for example, "2020-12-25" would become "2020-12-25T11:20:35.000Z"
// Enable this to include time with post dates. For example, "2020-12-25" would become
// "2020-12-25T11:20:35.000Z".
exports.include_time_with_date = false;
// override post date formatting with a custom formatting string (for example: 'yyyy LLL dd')
// tokens are documented here: https://moment.github.io/luxon/#/parsing?id=table-of-tokens
// if set, this takes precedence over include_time_with_date
// Override post date formatting with a custom formatting string (for example: 'yyyy LLL dd').
// Tokens are documented here: https://moment.github.io/luxon/#/parsing?id=table-of-tokens. If
// set, this takes precedence over include_time_with_date.
exports.custom_date_formatting = '';
// categories to be excluded from post frontmatter
// this does not filter out posts themselves, just the categories listed in their frontmatter
// Categories to be excluded from post frontmatter. This does not filter out posts themselves,
// just the categories listed in their frontmatter.
exports.filter_categories = ['uncategorized'];
+2 -2
View File
@@ -94,8 +94,8 @@ function initTurndownService() {
return turndownService;
}
function getPostContent(post, turndownService, config) {
let content = post.encoded[0];
function getPostContent(postData, turndownService, config) {
let content = postData.encoded[0];
// insert an empty div element between double line breaks
// this nifty trick causes turndown to keep adjacent paragraphs separated
+3 -1
View File
@@ -85,7 +85,9 @@ async function loadMarkdownFilePromise(post) {
} else {
// single string value
const escapedValue = (value || '').replace(/"/g, '\\"');
outputValue = `"${escapedValue}"`;
if (escapedValue.length > 0) {
outputValue = `"${escapedValue}"`;
}
}
if (outputValue !== undefined) {