Refactor for post data and frontmatter

2026-06-05 15:09:59 +09:00 · 2024-02-23 10:24:40 -05:00
parent 5db0a1b081
commit 1ad4e2dfdf
2 changed files with 63 additions and 47 deletions
@@ -9,32 +9,34 @@ const translator = require('./translator');
 async function parseFilePromise(config) { // reads config.input, parses it as XML, and resolves to the collected posts
 	console.log('\nParsing...');
 	const content = await fs.promises.readFile(config.input, 'utf8');
-	const data = await xml2js.parseStringPromise(content, {
+	const allData = await xml2js.parseStringPromise(content, {
 		trim: true,
 		tagNameProcessors: [xml2js.processors.stripPrefix] // strips the namespace prefix from tag names; items are read below as post_type, post_id, encoded, ...
 	});
+	const channelData = allData.rss.channel[0].item; // item array of the first channel; every collector below receives this array. NOTE(review): presumably undefined when the export contains no items - confirm

-	const postTypes = getPostTypes(data, config);
-	const posts = collectPosts(data, postTypes, config);
+	const postTypes = getPostTypes(channelData, config);
+	const posts = collectPosts(channelData, postTypes, config);

 	const images = [];
 	if (config.saveAttachedImages) {
-		images.push(...collectAttachedImages(data));
+		images.push(...collectAttachedImages(channelData));
 	}
 	if (config.saveScrapedImages) {
-		images.push(...collectScrapedImages(data, postTypes));
+		images.push(...collectScrapedImages(channelData, postTypes));
 	}

 	mergeImagesIntoPosts(images, posts);
+	populateFrontmatter(posts); // must run after mergeImagesIntoPosts(): frontmatter.coverImage is read from post.meta.coverImage, which that call may set

 	return posts;
 }

-function getPostTypes(data, config) {
+function getPostTypes(channelData, config) {
 	if (config.includeOtherTypes) {
 		// search export file for all post types minus some default types we don't want
 		// effectively this will be 'post', 'page', and custom post types
-		const types = data.rss.channel[0].item
+		const types = channelData
 			.map(item => item.post_type[0])
 			.filter(type => !['attachment', 'revision', 'nav_menu_item', 'custom_css', 'customize_changeset'].includes(type));
 		return [...new Set(types)]; // remove duplicates
@@ -44,34 +46,31 @@ function getPostTypes(data, config) {
 	}
 }

-function getItemsOfType(data, type) {
-	return data.rss.channel[0].item.filter(item => item.post_type[0] === type);
+function getItemsOfType(channelData, type) { // returns the entries of channelData whose first post_type value strictly equals type
+	return channelData.filter(item => item.post_type[0] === type);
 }

-function collectPosts(data, postTypes, config) {
+function collectPosts(channelData, postTypes, config) {
 	// this is passed into getPostContent() for the markdown conversion
 	const turndownService = translator.initTurndownService();

 	let allPosts = [];
 	postTypes.forEach(postType => {
-		const postsForType = getItemsOfType(data, postType)
-			.filter(post => post.status[0] !== 'trash' && post.status[0] !== 'draft')
-			.map(post => ({
+		const postsForType = getItemsOfType(channelData, postType)
+			.filter(postData => postData.status[0] !== 'trash' && postData.status[0] !== 'draft')
+			.map(postData => ({
+				data: postData,
+
 				// meta data isn't written to file, but is used to help with other things
 				meta: {
-					id: getPostId(post),
-					slug: getPostSlug(post),
-					coverImageId: getPostCoverImageId(post),
+					id: getPostId(postData),
+					slug: getPostSlug(postData),
+					coverImageId: getPostCoverImageId(postData),
+					coverImage: undefined, // possibly set later in mergeImagesIntoPosts()
 					type: postType,
 					imageUrls: []
 				},
-				frontmatter: {
-					title: getPostTitle(post),
-					date: getPostDate(post),
-					categories: getCategories(post),
-					tags: getTags(post)
-				},
-				content: translator.getPostContent(post, turndownService, config)
+				content: translator.getPostContent(postData, turndownService, config)
 			}));

 		if (postTypes.length > 1) {
@@ -87,30 +86,30 @@ function collectPosts(data, postTypes, config) {
 	return allPosts;
 }

-function getPostId(post) {
-	return post.post_id[0];
+function getPostId(postData) { // postData is a raw item from channelData, not the wrapper object built in collectPosts()
+	return postData.post_id[0]; // first post_id value of the item
 }

-function getPostSlug(post) {
-	return decodeURIComponent(post.post_name[0]);
+function getPostSlug(postData) { // returns the item's first post_name value, URI-decoded
+	return decodeURIComponent(postData.post_name[0]); // decodeURIComponent throws URIError on a malformed percent sequence
 }

-function getPostCoverImageId(post) {
-	if (post.postmeta === undefined) {
+function getPostCoverImageId(postData) { // returns the first meta_value of the item's '_thumbnail_id' postmeta entry, or undefined when there is none
+	if (postData.postmeta === undefined) { // item carries no postmeta entries at all
 		return undefined;
 	}

-	const postmeta = post.postmeta.find(postmeta => postmeta.meta_key[0] === '_thumbnail_id');
+	const postmeta = postData.postmeta.find(postmeta => postmeta.meta_key[0] === '_thumbnail_id');
 	const id = postmeta ? postmeta.meta_value[0] : undefined; // find() yields undefined when no entry has that key
 	return id;
 }

 function getPostTitle(post) { // post is the wrapper object built in collectPosts(); the raw item is on post.data
-	return post.title[0];
+	return post.data.title[0]; // first title value of the raw item
 }

 function getPostDate(post) {
-	const dateTime = luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: 'utc' });
+	const dateTime = luxon.DateTime.fromRFC2822(post.data.pubDate[0], { zone: 'utc' });

 	if (settings.custom_date_formatting) {
 		return dateTime.toFormat(settings.custom_date_formatting);
@@ -122,26 +121,30 @@ function getPostDate(post) {
 }

 function getCategories(post) { // decoded nicenames of the post's 'category' entries, minus those listed in settings.filter_categories
-	const categories = processCategoryTags(post, 'category');
+	const categories = processCategoryTags(post.data, 'category');
 	return categories.filter(category => !settings.filter_categories.includes(category));
 }

 function getTags(post) { // decoded nicenames of the post's category entries whose domain is 'post_tag'
-	return processCategoryTags(post, 'post_tag');
+	return processCategoryTags(post.data, 'post_tag');
 }

-function processCategoryTags(post, domain) {
-	if (!post.category) {
+function getCoverImage(post) { // returns the cover image filename stored on the post's meta, if any
+	return post.meta.coverImage; // initialised to undefined in collectPosts(); assigned in mergeImagesIntoPosts() when an image id matches meta.coverImageId
+}
+
+function processCategoryTags(postData, domain) { // returns the URI-decoded nicename attribute of each category entry whose domain attribute equals domain
+	if (!postData.category) { // item has no category entries
 		return [];
 	}

-	return post.category
+	return postData.category
 		.filter(category => category.$.domain === domain) // $ is the key under which xml2js stores an element's attributes by default
 		.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
 }

-function collectAttachedImages(data) {
-	const images = getItemsOfType(data, 'attachment')
+function collectAttachedImages(channelData) {
+	const images = getItemsOfType(channelData, 'attachment')
 		// filter to certain image file types
 		.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
 		.map(attachment => ({
@@ -154,13 +157,13 @@ function collectAttachedImages(data) {
 	return images;
 }

-function collectScrapedImages(data, postTypes) {
+function collectScrapedImages(channelData, postTypes) {
 	const images = [];
 	postTypes.forEach(postType => {
-		getItemsOfType(data, postType).forEach(post => {
-			const postId = post.post_id[0];
-			const postContent = post.encoded[0];
-			const postLink = post.link[0];
+		getItemsOfType(channelData, postType).forEach(postData => {
+			const postId = postData.post_id[0];
+			const postContent = postData.encoded[0];
+			const postLink = postData.link[0];

 			const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
 			matches.forEach(match => {
@@ -192,7 +195,7 @@ function mergeImagesIntoPosts(images, posts) {
 			// this image was set as the featured image for this post
 			if (image.id === post.meta.coverImageId) {
 				shouldAttach = true;
-				post.frontmatter.coverImage = shared.getFilenameFromUrl(image.url);
+				post.meta.coverImage = shared.getFilenameFromUrl(image.url);
 			}

 			if (shouldAttach && !post.meta.imageUrls.includes(image.url)) {
@@ -202,4 +205,17 @@ function mergeImagesIntoPosts(images, posts) {
 	});
 }

+function populateFrontmatter(posts) {
+	posts.forEach(post => {
+		console.log(post);
+		post.frontmatter = {
+			title: getPostTitle(post),
+			date: getPostDate(post),
+			categories: getCategories(post),
+			tags: getTags(post),
+			coverImage: getCoverImage(post)
+		}
+	});
+}
+
 exports.parseFilePromise = parseFilePromise;
@@ -94,8 +94,8 @@ function initTurndownService() {
 	return turndownService;
 }

-function getPostContent(post, turndownService, config) {
-	let content = post.encoded[0];
+function getPostContent(postData, turndownService, config) {
+	let content = postData.encoded[0];

 	// insert an empty div element between double line breaks
 	// this nifty trick causes turndown to keep adjacent paragraphs separated