Files
wordpress-export-to-markdown/src/parser.js
T

188 lines
5.2 KiB
JavaScript
Raw Normal View History

const fs = require('fs');
const xml2js = require('xml2js');
const shared = require('./shared');
2019-12-17 13:52:09 -05:00
const translator = require('./translator');
2024-02-23 12:53:58 -05:00
// Frontmatter field getters, keyed by the frontmatter field each one produces. Every entry is
// loaded from its own module under ./frontmatter and is called with a post object in
// populateFrontmatter().
const frontmatter = {
title: require('./frontmatter/title'),
date: require('./frontmatter/date'),
categories: require('./frontmatter/categories'),
tags: require('./frontmatter/tags'),
coverImage: require('./frontmatter/coverImage'),
};
2019-12-17 14:03:58 -05:00
/**
 * Reads the export file at `config.input`, parses it as XML, and turns its items into post
 * objects with image URLs merged in and frontmatter populated.
 *
 * @param {object} config - Reads `input`, `saveAttachedImages`, and `saveScrapedImages` here;
 *   the whole object is also handed to getPostTypes() and collectPosts().
 * @returns {Promise<object[]>} Resolves to the collected posts.
 */
async function parseFilePromise(config) {
  console.log('\nParsing...');
  const content = await fs.promises.readFile(config.input, 'utf8');
  const allData = await xml2js.parseStringPromise(content, {
    trim: true,
    // tag names are run through xml2js's stripPrefix processor before being used as keys
    tagNameProcessors: [xml2js.processors.stripPrefix]
  });

  // a channel without any <item> children has no `item` key at all; treat that as an empty
  // export instead of crashing later on `.map()` / `.filter()`
  const channelData = allData.rss.channel[0].item ?? [];

  const postTypes = getPostTypes(channelData, config);
  const posts = collectPosts(channelData, postTypes, config);

  const images = [];
  if (config.saveAttachedImages) {
    images.push(...collectAttachedImages(channelData));
  }
  if (config.saveScrapedImages) {
    images.push(...collectScrapedImages(channelData, postTypes));
  }

  mergeImagesIntoPosts(images, posts);
  populateFrontmatter(posts);

  return posts;
}
2024-02-23 10:24:40 -05:00
/**
 * Determines which post types should be processed.
 *
 * @param {object[]} channelData - Export items; each item's `post_type[0]` is read.
 * @param {object} config - Only `includeOtherTypes` is read.
 * @returns {string[]} `['post']` unless `config.includeOtherTypes` is truthy, in which case the
 *   distinct post types found in `channelData` (in order of first appearance) minus the
 *   excluded built-in types.
 */
function getPostTypes(channelData, config) {
  if (!config.includeOtherTypes) {
    // just plain old vanilla "post" posts
    return ['post'];
  }

  // search export file for all post types minus some default types we don't want
  // effectively this will be 'post', 'page', and custom post types
  const excludedTypes = new Set(['attachment', 'revision', 'nav_menu_item', 'custom_css', 'customize_changeset']);
  const types = new Set(); // a Set drops duplicates while keeping first-seen order
  for (const item of channelData) {
    const type = item.post_type[0];
    if (!excludedTypes.has(type)) {
      types.add(type);
    }
  }
  return [...types];
}
2024-02-23 10:24:40 -05:00
/**
 * Picks out the export items of a single post type.
 *
 * @param {object[]} channelData - Export items; each item's `post_type[0]` is compared.
 * @param {string} type - Post type to keep.
 * @returns {object[]} New array holding the matching items in their original order.
 */
function getItemsOfType(channelData, type) {
  const matchingItems = [];
  for (const item of channelData) {
    if (item.post_type[0] === type) {
      matchingItems.push(item);
    }
  }
  return matchingItems;
}
2024-02-23 10:24:40 -05:00
/**
 * Builds a post object for every item of the requested post types, skipping items whose status
 * is 'trash' or 'draft', and logs how many were found.
 *
 * @param {object[]} channelData - Export items.
 * @param {string[]} postTypes - Post types to collect, processed in this order.
 * @param {object} config - Passed through to translator.getPostContent().
 * @returns {object[]} Post objects shaped `{ data, meta, content }`, grouped by post type.
 */
function collectPosts(channelData, postTypes, config) {
  // this is passed into getPostContent() for the markdown conversion
  const turndownService = translator.initTurndownService();

  const allPosts = [];
  for (const postType of postTypes) {
    const postsForType = getItemsOfType(channelData, postType)
      .filter(postData => postData.status[0] !== 'trash' && postData.status[0] !== 'draft')
      .map(postData => ({
        data: postData,

        // meta data isn't written to file, but is used to help with other things
        meta: {
          id: getPostId(postData),
          slug: getPostSlug(postData),
          coverImageId: getPostCoverImageId(postData),
          coverImage: undefined, // possibly set later in mergeImagesIntoPosts()
          type: postType,
          imageUrls: []
        },

        content: translator.getPostContent(postData, turndownService, config)
      }));

    // per-type counts are only reported when more than one type is being collected
    if (postTypes.length > 1) {
      console.log(`${postsForType.length} "${postType}" posts found.`);
    }

    // appended one at a time: spreading a very large array into push() can exceed the
    // engine's argument limit
    for (const post of postsForType) {
      allPosts.push(post);
    }
  }

  if (postTypes.length === 1) {
    console.log(`${allPosts.length} posts found.`);
  }
  return allPosts;
}
2024-02-23 10:24:40 -05:00
/**
 * Reads the ID of an export item.
 *
 * @param {object} postData - Export item.
 * @returns {*} The first entry of `postData.post_id`.
 */
function getPostId(postData) {
  const { post_id: ids } = postData;
  return ids[0];
}
2024-02-23 10:24:40 -05:00
/**
 * Reads the slug of an export item and URI-decodes it.
 *
 * @param {object} postData - Export item; `post_name[0]` is read.
 * @returns {string} The decoded slug, or the slug exactly as exported when it contains a
 *   malformed percent escape.
 */
function getPostSlug(postData) {
  const rawSlug = postData.post_name[0];
  try {
    return decodeURIComponent(rawSlug);
  } catch (err) {
    if (err instanceof URIError) {
      // decodeURIComponent() rejects malformed escapes such as a bare '%'; one odd slug should
      // not abort the whole run, so keep it undecoded
      return rawSlug;
    }
    throw err;
  }
}
2024-02-23 10:24:40 -05:00
/**
 * Looks up the value of an export item's `_thumbnail_id` post meta entry.
 *
 * @param {object} postData - Export item; `postmeta` may be undefined.
 * @returns {*} `meta_value[0]` of the first `_thumbnail_id` entry, or undefined when the item
 *   has no `postmeta` or no such entry.
 */
function getPostCoverImageId(postData) {
  if (postData.postmeta === undefined) {
    return undefined;
  }

  const thumbnailMeta = postData.postmeta.find(meta => meta.meta_key[0] === '_thumbnail_id');
  return thumbnailMeta ? thumbnailMeta.meta_value[0] : undefined;
}
2024-02-23 10:24:40 -05:00
/**
 * Collects the 'attachment' items whose URL ends in a supported image extension and logs how
 * many were found.
 *
 * @param {object[]} channelData - Export items.
 * @returns {{id: *, postId: *, url: string}[]} One entry per image attachment: its own
 *   `post_id[0]`, its `post_parent[0]`, and its `attachment_url[0]`.
 */
function collectAttachedImages(channelData) {
  // filter to certain image file types (case-insensitive, extension must end the URL)
  const imageUrlPattern = /\.(gif|jpe?g|png)$/i;

  const images = getItemsOfType(channelData, 'attachment')
    .filter(attachment => imageUrlPattern.test(attachment.attachment_url[0]))
    .map(attachment => ({
      id: attachment.post_id[0],
      postId: attachment.post_parent[0],
      url: attachment.attachment_url[0]
    }));

  console.log(`${images.length} attached images found.`);
  return images;
}
2024-02-23 10:24:40 -05:00
/**
 * Scans the body content of every item of the given post types for <img> tags whose src ends
 * in a supported image extension, and logs how many were found.
 *
 * @param {object[]} channelData - Export items; `post_id[0]`, `encoded[0]`, and `link[0]` are read.
 * @param {string[]} postTypes - Post types whose items are scanned.
 * @returns {{id: number, postId: *, url: string}[]} One entry per matched image. `id` is always
 *   -1, and `url` is the src resolved against the post's link.
 */
function collectScrapedImages(channelData, postTypes) {
  // `[^"]+?` keeps the capture inside one src attribute; a plain `.+?` could run past the
  // closing quote (even into a later tag) until it reached an image extension
  const imageTagPattern = /<img[^>]*src="([^"]+?\.(?:gif|jpe?g|png))"[^>]*>/gi;

  const images = [];
  for (const postType of postTypes) {
    for (const postData of getItemsOfType(channelData, postType)) {
      const postId = postData.post_id[0];
      const postContent = postData.encoded[0];
      const postLink = postData.link[0];

      for (const match of postContent.matchAll(imageTagPattern)) {
        let url;
        try {
          // base the matched image URL relative to the post URL
          url = new URL(match[1], postLink).href;
        } catch (err) {
          // new URL() throws on input it cannot parse; skip that one image, keep the rest
          console.log(`Skipping unparseable image URL "${match[1]}" in post ${postId}.`);
          continue;
        }

        images.push({
          id: -1,
          postId,
          url
        });
      }
    }
  }

  console.log(`${images.length} images scraped from post body content.`);
  return images;
}
/**
 * Attaches each image URL to every post it belongs to, and records the cover image filename.
 * Posts are modified in place.
 *
 * @param {{id: *, postId: *, url: string}[]} images - Images to distribute.
 * @param {object[]} posts - Posts whose `meta.id` and `meta.coverImageId` are compared against
 *   each image; `meta.imageUrls` and `meta.coverImage` are updated.
 * @returns {void}
 */
function mergeImagesIntoPosts(images, posts) {
  for (const image of images) {
    for (const post of posts) {
      // this image was uploaded as an attachment to this post
      const isAttachedToPost = image.postId === post.meta.id;
      // this image was set as the featured image for this post
      const isCoverImage = image.id === post.meta.coverImageId;

      if (isCoverImage) {
        post.meta.coverImage = shared.getFilenameFromUrl(image.url);
      }

      // each URL is listed at most once per post
      if ((isAttachedToPost || isCoverImage) && !post.meta.imageUrls.includes(image.url)) {
        post.meta.imageUrls.push(image.url);
      }
    }
  }
}
2024-02-23 10:24:40 -05:00
/**
 * Sets `post.frontmatter` on every post by calling each frontmatter getter with the post.
 * Posts are modified in place.
 *
 * @param {object[]} posts - Posts to populate.
 * @returns {void}
 */
function populateFrontmatter(posts) {
  for (const post of posts) {
    post.frontmatter = {
      title: frontmatter.title(post),
      date: frontmatter.date(post),
      categories: frontmatter.categories(post),
      tags: frontmatter.tags(post),
      coverImage: frontmatter.coverImage(post)
    };
  }
}
2019-12-17 13:52:09 -05:00
// parseFilePromise() is the only function this module exports
exports.parseFilePromise = parseFilePromise;