Refactor for post data and frontmatter

This commit is contained in:
Will Boyd
2024-02-23 10:24:40 -05:00
parent 5db0a1b081
commit 1ad4e2dfdf
2 changed files with 63 additions and 47 deletions
+61 -45
View File
@@ -9,32 +9,34 @@ const translator = require('./translator');
/**
 * Reads the export file named by `config.input`, parses it as XML and builds
 * the list of posts.
 *
 * @param {object} config - Options object. This function reads `input`,
 *   `saveAttachedImages` and `saveScrapedImages`; the whole object is also
 *   forwarded to getPostTypes() and collectPosts().
 * @returns {Promise<object[]>} The posts produced by collectPosts(), after
 *   mergeImagesIntoPosts() and populateFrontmatter() have been applied to them.
 */
async function parseFilePromise(config) {
  console.log('\nParsing...');
  const content = await fs.promises.readFile(config.input, 'utf8');
  const allData = await xml2js.parseStringPromise(content, {
    trim: true,
    tagNameProcessors: [xml2js.processors.stripPrefix]
  });

  // every helper below works on the flat list of channel items rather than the whole document
  const channelData = allData.rss.channel[0].item;

  const postTypes = getPostTypes(channelData, config);
  const posts = collectPosts(channelData, postTypes, config);

  const images = [];
  if (config.saveAttachedImages) {
    images.push(...collectAttachedImages(channelData));
  }
  if (config.saveScrapedImages) {
    images.push(...collectScrapedImages(channelData, postTypes));
  }

  mergeImagesIntoPosts(images, posts);

  // must run after the merge: the cover image is only known once images are attached
  populateFrontmatter(posts);

  return posts;
}
function getPostTypes(data, config) { function getPostTypes(channelData, config) {
if (config.includeOtherTypes) { if (config.includeOtherTypes) {
// search export file for all post types minus some default types we don't want // search export file for all post types minus some default types we don't want
// effectively this will be 'post', 'page', and custom post types // effectively this will be 'post', 'page', and custom post types
const types = data.rss.channel[0].item const types = channelData
.map(item => item.post_type[0]) .map(item => item.post_type[0])
.filter(type => !['attachment', 'revision', 'nav_menu_item', 'custom_css', 'customize_changeset'].includes(type)); .filter(type => !['attachment', 'revision', 'nav_menu_item', 'custom_css', 'customize_changeset'].includes(type));
return [...new Set(types)]; // remove duplicates return [...new Set(types)]; // remove duplicates
@@ -44,34 +46,31 @@ function getPostTypes(data, config) {
} }
} }
/**
 * Returns the channel items whose first `post_type` entry equals `type`.
 *
 * @param {object[]} channelData - Channel items; each must expose a `post_type` array.
 * @param {string} type - Post type to keep (compared with strict equality).
 * @returns {object[]} A new array holding the matching items, in their original order.
 */
function getItemsOfType(channelData, type) {
  return channelData.filter(item => item.post_type[0] === type);
}
function collectPosts(data, postTypes, config) { function collectPosts(channelData, postTypes, config) {
// this is passed into getPostContent() for the markdown conversion // this is passed into getPostContent() for the markdown conversion
const turndownService = translator.initTurndownService(); const turndownService = translator.initTurndownService();
let allPosts = []; let allPosts = [];
postTypes.forEach(postType => { postTypes.forEach(postType => {
const postsForType = getItemsOfType(data, postType) const postsForType = getItemsOfType(channelData, postType)
.filter(post => post.status[0] !== 'trash' && post.status[0] !== 'draft') .filter(postData => postData.status[0] !== 'trash' && postData.status[0] !== 'draft')
.map(post => ({ .map(postData => ({
data: postData,
// meta data isn't written to file, but is used to help with other things // meta data isn't written to file, but is used to help with other things
meta: { meta: {
id: getPostId(post), id: getPostId(postData),
slug: getPostSlug(post), slug: getPostSlug(postData),
coverImageId: getPostCoverImageId(post), coverImageId: getPostCoverImageId(postData),
coverImage: undefined, // possibly set later in mergeImagesIntoPosts()
type: postType, type: postType,
imageUrls: [] imageUrls: []
}, },
frontmatter: { content: translator.getPostContent(postData, turndownService, config)
title: getPostTitle(post),
date: getPostDate(post),
categories: getCategories(post),
tags: getTags(post)
},
content: translator.getPostContent(post, turndownService, config)
})); }));
if (postTypes.length > 1) { if (postTypes.length > 1) {
@@ -87,30 +86,30 @@ function collectPosts(data, postTypes, config) {
return allPosts; return allPosts;
} }
/**
 * Returns the id of a post.
 *
 * @param {object} postData - A single channel item; must expose a `post_id` array.
 * @returns {*} The first entry of `postData.post_id`.
 */
function getPostId(postData) {
  return postData.post_id[0];
}
/**
 * Returns the URI-decoded slug of a post.
 *
 * @param {object} postData - A single channel item; must expose a `post_name` array.
 * @returns {string} The first entry of `postData.post_name`, passed through decodeURIComponent().
 * @throws {URIError} If the slug contains a malformed percent-escape sequence.
 */
function getPostSlug(postData) {
  return decodeURIComponent(postData.post_name[0]);
}
/**
 * Looks up the id stored under the `_thumbnail_id` meta key of a post.
 *
 * @param {object} postData - A single channel item; `postmeta`, when present,
 *   is an array of entries exposing `meta_key` and `meta_value` arrays.
 * @returns {*} The first `meta_value` of the `_thumbnail_id` entry, or
 *   undefined when the post has no `postmeta` or no such entry.
 */
function getPostCoverImageId(postData) {
  if (postData.postmeta === undefined) {
    return undefined;
  }

  const thumbnailMeta = postData.postmeta.find(meta => meta.meta_key[0] === '_thumbnail_id');
  return thumbnailMeta ? thumbnailMeta.meta_value[0] : undefined;
}
/**
 * Returns the title of a collected post.
 *
 * @param {object} post - A collected post whose raw item lives on `post.data`.
 * @returns {*} The first entry of `post.data.title`.
 */
function getPostTitle(post) {
  return post.data.title[0];
}
function getPostDate(post) { function getPostDate(post) {
const dateTime = luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: 'utc' }); const dateTime = luxon.DateTime.fromRFC2822(post.data.pubDate[0], { zone: 'utc' });
if (settings.custom_date_formatting) { if (settings.custom_date_formatting) {
return dateTime.toFormat(settings.custom_date_formatting); return dateTime.toFormat(settings.custom_date_formatting);
@@ -122,26 +121,30 @@ function getPostDate(post) {
} }
/**
 * Returns the category names of a collected post, minus the ones the
 * settings ask to drop.
 *
 * @param {object} post - A collected post whose raw item lives on `post.data`.
 * @returns {string[]} The names from processCategoryTags(post.data, 'category')
 *   that are not listed in `settings.filter_categories`.
 */
function getCategories(post) {
  const categories = processCategoryTags(post.data, 'category');
  return categories.filter(category => !settings.filter_categories.includes(category));
}
/**
 * Returns the tag names of a collected post.
 *
 * @param {object} post - A collected post whose raw item lives on `post.data`.
 * @returns {string[]} Whatever processCategoryTags(post.data, 'post_tag') returns.
 */
function getTags(post) {
  return processCategoryTags(post.data, 'post_tag');
}
/**
 * Returns the cover image recorded in a collected post's meta data.
 *
 * @param {object} post - A collected post exposing a `meta` object.
 * @returns {*} The value of `post.meta.coverImage` (undefined when none was set).
 */
function getCoverImage(post) {
  return post.meta.coverImage;
}

/**
 * Extracts the decoded `nicename` of every category entry in a given domain.
 *
 * @param {object} postData - A single channel item; `category`, when present,
 *   is an array of entries whose attributes live under `$`.
 * @param {string} domain - Domain attribute to match, e.g. 'category' or 'post_tag'.
 * @returns {string[]} URI-decoded nicenames of the matching entries, or an
 *   empty array when the item has no `category` property.
 */
function processCategoryTags(postData, domain) {
  if (!postData.category) {
    return [];
  }

  return postData.category
    .filter(category => category.$.domain === domain)
    .map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
}
function collectAttachedImages(data) { function collectAttachedImages(channelData) {
const images = getItemsOfType(data, 'attachment') const images = getItemsOfType(channelData, 'attachment')
// filter to certain image file types // filter to certain image file types
.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0])) .filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
.map(attachment => ({ .map(attachment => ({
@@ -154,13 +157,13 @@ function collectAttachedImages(data) {
return images; return images;
} }
function collectScrapedImages(data, postTypes) { function collectScrapedImages(channelData, postTypes) {
const images = []; const images = [];
postTypes.forEach(postType => { postTypes.forEach(postType => {
getItemsOfType(data, postType).forEach(post => { getItemsOfType(channelData, postType).forEach(postData => {
const postId = post.post_id[0]; const postId = postData.post_id[0];
const postContent = post.encoded[0]; const postContent = postData.encoded[0];
const postLink = post.link[0]; const postLink = postData.link[0];
const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)]; const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
matches.forEach(match => { matches.forEach(match => {
@@ -192,7 +195,7 @@ function mergeImagesIntoPosts(images, posts) {
// this image was set as the featured image for this post // this image was set as the featured image for this post
if (image.id === post.meta.coverImageId) { if (image.id === post.meta.coverImageId) {
shouldAttach = true; shouldAttach = true;
post.frontmatter.coverImage = shared.getFilenameFromUrl(image.url); post.meta.coverImage = shared.getFilenameFromUrl(image.url);
} }
if (shouldAttach && !post.meta.imageUrls.includes(image.url)) { if (shouldAttach && !post.meta.imageUrls.includes(image.url)) {
@@ -202,4 +205,17 @@ function mergeImagesIntoPosts(images, posts) {
}); });
} }
/**
 * Builds the `frontmatter` object of every post, in place.
 *
 * Each post receives a fresh `frontmatter` with `title`, `date`, `categories`,
 * `tags` and `coverImage`, computed by the matching getter in this module.
 *
 * @param {object[]} posts - Collected posts; each one is mutated.
 * @returns {void}
 */
function populateFrontmatter(posts) {
  posts.forEach(post => {
    post.frontmatter = {
      title: getPostTitle(post),
      date: getPostDate(post),
      categories: getCategories(post),
      tags: getTags(post),
      coverImage: getCoverImage(post)
    };
  });
}
// the only entry point this module exposes
exports.parseFilePromise = parseFilePromise;
+2 -2
View File
@@ -94,8 +94,8 @@ function initTurndownService() {
return turndownService; return turndownService;
} }
function getPostContent(post, turndownService, config) { function getPostContent(postData, turndownService, config) {
let content = post.encoded[0]; let content = postData.encoded[0];
// insert an empty div element between double line breaks // insert an empty div element between double line breaks
// this nifty trick causes turndown to keep adjacent paragraphs separated // this nifty trick causes turndown to keep adjacent paragraphs separated