Merge pull request #149 from lonekorean/parse-debugging

Parse debugging
This commit is contained in:
Will Boyd
2025-02-25 17:12:14 -05:00
committed by GitHub
5 changed files with 211 additions and 87 deletions
+108
View File
@@ -0,0 +1,108 @@
import xml2js from 'xml2js';
// Read-only wrapper around one node of the object tree produced by xml2js.
// Every lookup either returns what was asked for or throws an error naming the
// exact expression that failed (for example "rss.channel[0].item[3].title"),
// the "optional" variants return undefined instead of throwing.
class Data {
	#obj;
	#expression;

	// obj is a node as returned by xml2js (a string for leaf nodes, otherwise an object)
	// expression describes how obj was reached, it is only used to build error messages
	constructor(obj, expression) {
		// xml2js returns leaf nodes as strings, turn those into consistent objects
		// I found this to be safer and more efficient than using the explicitCharkey option
		this.#obj = typeof obj === 'string' ? { _: obj } : obj;

		// this identifies how the object was referenced, helps a ton with debugging
		this.#expression = expression;
	}

	// builds the expression for a child property, with the index appended when given
	#buildExpression(propName, index = undefined) {
		const expression = `${this.#expression}.${propName}`;
		return index === undefined ? expression : `${expression}[${index}]`;
	}

	// returns the array of child nodes stored under propName, or undefined if there is none
	// only own array properties count, so inherited members (like "constructor" or "toString")
	// and the special "_" (text) and "$" (attributes) keys are never mistaken for child nodes
	#nodes(propName) {
		if (this.#obj == null || !Object.hasOwn(this.#obj, propName)) {
			return undefined;
		}

		const nodes = this.#obj[propName];
		return Array.isArray(nodes) ? nodes : undefined;
	}

	// will not throw an error if property doesn't exist, defaults to empty array
	children(propName) {
		const nodes = this.#nodes(propName) ?? [];
		return nodes.map((value, index) => new Data(value, this.#buildExpression(propName, index)));
	}

	// throws an error if property (or index on property) doesn't exist
	child(propName, index = 0) {
		const nodes = this.#nodes(propName);
		if (nodes === undefined) {
			throw new Error(`Could not find ${this.#buildExpression(propName)}.`);
		}

		const node = nodes[index];
		if (node === undefined) {
			throw new Error(`Could not find ${this.#buildExpression(propName, index)}.`);
		}

		return new Data(node, this.#buildExpression(propName, index));
	}

	// convenience function, since it's very common to want the value of a child
	childValue(propName, index = 0) {
		return this.child(propName, index).value();
	}

	// throws an error if this object doesn't have a value string
	value() {
		const value = this.optionalValue();
		if (value === undefined) {
			throw new Error(`Could not get value from ${this.#expression}.`);
		}

		return value;
	}

	// throws an error if attribute does not exist
	attribute(attrName) {
		const attributes = this.#obj?.$;
		// own properties only, so attribute('toString') and friends are reported as missing
		const attribute = attributes != null && Object.hasOwn(attributes, attrName) ? attributes[attrName] : undefined;
		if (attribute === undefined) {
			throw new Error(`Could not get attribute ${attrName} from ${this.#expression}.`);
		}

		return attribute;
	}

	// the optional functions below look things up directly instead of catching errors
	// thrown by their strict counterparts, so a genuine bug is never silently swallowed

	// same as child(), but returns undefined if property (or index on property) doesn't exist
	optionalChild(propName, index = 0) {
		const node = this.#nodes(propName)?.[index];
		return node === undefined ? undefined : new Data(node, this.#buildExpression(propName, index));
	}

	// same as childValue(), but returns undefined if the child or its value doesn't exist
	optionalChildValue(propName, index = 0) {
		return this.optionalChild(propName, index)?.optionalValue();
	}

	// same as value(), but returns undefined if this object doesn't have a value string
	optionalValue() {
		return this.#obj?._;
	}
}
// Parses the XML export file content and returns its <rss> root node wrapped in a Data object.
// Throws if content can't be parsed as XML, or if the parsed result has no <rss> root node.
export async function load(content) {
	let rootData;
	try {
		rootData = await xml2js.parseStringPromise(content, {
			tagNameProcessors: [xml2js.processors.stripPrefix],
			trim: true
		});
	} catch (ex) {
		// keep the original error (and its stack), just make the message more helpful
		ex.message = 'Could not parse XML. This likely means your import file is malformed.\n\n' + ex.message;
		throw ex;
	}

	// xml2js resolves with null for empty or whitespace-only content, optional chaining
	// routes that case to the friendly error below instead of an opaque TypeError
	const rssData = rootData?.rss;
	if (rssData === undefined) {
		throw new Error('Could not find <rss> root node. This likely means your import file is malformed.');
	}

	return new Data(rssData, 'rss');
}
+25 -36
View File
@@ -1,74 +1,63 @@
// get author, without decoding
// WordPress doesn't allow funky characters in usernames anyway
export function author(post) {
return post.data.creator[0];
// not decoded (WordPress doesn't allow funky characters in usernames anyway)
// surprisingly, does not always exist (squarespace exports, for example)
return post.data.optionalChildValue('creator');
}
// get array of decoded category names, excluding 'uncategorized'
export function categories(post) {
if (!post.data.category) {
return [];
}
const categories = post.data.category
.filter(category => category.$.domain === 'category')
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
return categories.filter((category) => category !== 'uncategorized');
// array of decoded category names, excluding 'uncategorized'
const categories = post.data.children('category');
return categories
.filter((category) => category.attribute('domain') === 'category' && category.attribute('nicename') !== 'uncategorized')
.map((category) => decodeURIComponent(category.attribute('nicename')));
}
// get cover image filename, previously decoded and set on post
// this one is unique as it relies on special logic executed by the parser
export function coverImage(post) {
	// filename of the cover image, already parsed and decoded before it was stored on the post
	const { coverImage: filename } = post;
	return filename;
}
// get post date, previously saved as a luxon datetime object on post
export function date(post) {
	// the post's date as stored on the post earlier (a luxon datetime object), returned untouched
	const { date: parsedDate } = post;
	return parsedDate;
}
// get boolean indicating if post is a draft
// this will only be included if true, otherwise it's left off
export function draft(post) {
	// true when the previously parsed draft status is set
	// otherwise undefined, so the field is only included for drafts
	if (post.isDraft) {
		return true;
	}

	return undefined;
}
// get excerpt, not decoded, newlines collapsed
export function excerpt(post) {
return post.data.encoded[1].replace(/[\r\n]+/gm, ' ');
// not decoded, newlines collapsed
// does not always exist (squarespace exports, for example)
const encoded = post.data.optionalChildValue('encoded', 1);
return encoded ? encoded.replace(/[\r\n]+/gm, ' ') : undefined;
}
// get ID, as an integer
export function id(post) {
	// previously parsed as a string, converted to integer here
	// explicit radix so the id is always read as decimal (never as hex for a "0x" prefix)
	return Number.parseInt(post.id, 10);
}
// get slug, previously decoded and set on post
export function slug(post) {
	// the slug was parsed and decoded earlier, so it is handed back as is
	const { slug: decodedSlug } = post;
	return decodedSlug;
}
// get array of decoded tag names
export function tags(post) {
if (!post.data.category) {
return [];
}
const categories = post.data.category
.filter(category => category.$.domain === 'post_tag')
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
return categories;
// array of decoded tag names (yes, they come from <category> nodes, not a typo)
const categories = post.data.children('category');
return categories
.filter((category) => category.attribute('domain') === 'post_tag')
.map((category) => decodeURIComponent(category.attribute('nicename')));
}
// get simple post title, but not decoded like other frontmatter string fields
export function title(post) {
return post.data.title[0];
// not decoded
return post.data.childValue('title');
}
// get type, often this will always be "post"
// but can also be "page" or other custom types
export function type(post) {
	// parsed earlier and left undecoded, typically "post" or "page" but custom types are possible
	const { type: postType } = post;
	return postType;
}
+63 -49
View File
@@ -1,6 +1,6 @@
import fs from 'fs';
import * as luxon from 'luxon';
import xml2js from 'xml2js';
import * as data from './data.js';
import * as frontmatter from './frontmatter.js';
import * as shared from './shared.js';
import * as translator from './translator.js';
@@ -8,21 +8,18 @@ import * as translator from './translator.js';
export async function parseFilePromise() {
console.log('\nParsing...');
const content = await fs.promises.readFile(shared.config.input, 'utf8');
const allData = await xml2js.parseStringPromise(content, {
trim: true,
tagNameProcessors: [xml2js.processors.stripPrefix]
});
const channelData = allData.rss.channel[0].item;
const rssData = await data.load(content);
const allPostData = rssData.child('channel').children('item');
const postTypes = getPostTypes(channelData);
const posts = collectPosts(channelData, postTypes);
const postTypes = getPostTypes(allPostData);
const posts = collectPosts(allPostData, postTypes);
const images = [];
if (shared.config.saveImages === 'attached' || shared.config.saveImages === 'all') {
images.push(...collectAttachedImages(channelData));
images.push(...collectAttachedImages(allPostData));
}
if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') {
images.push(...collectScrapedImages(channelData, postTypes));
images.push(...collectScrapedImages(allPostData, postTypes));
}
mergeImagesIntoPosts(images, posts);
@@ -31,11 +28,11 @@ export async function parseFilePromise() {
return posts;
}
function getPostTypes(channelData) {
function getPostTypes(allPostData) {
// search export file for all post types minus some specific types we don't want
const types = channelData
.map(item => item.post_type[0])
.filter(type => ![
const postTypes = allPostData
.map((postData) => postData.childValue('post_type'))
.filter((postType) => ![
'attachment',
'revision',
'nav_menu_item',
@@ -48,20 +45,20 @@ function getPostTypes(channelData) {
'wp_navigation',
'wp_template',
'wp_template_part'
].includes(type));
return [...new Set(types)]; // remove duplicates
].includes(postType));
return [...new Set(postTypes)]; // remove duplicates
}
function getItemsOfType(channelData, type) {
return channelData.filter(item => item.post_type[0] === type);
function getItemsOfType(allPostData, type) {
return allPostData.filter(item => item.childValue('post_type') === type);
}
function collectPosts(channelData, postTypes) {
function collectPosts(allPostData, postTypes) {
let allPosts = [];
postTypes.forEach(postType => {
const postsForType = getItemsOfType(channelData, postType)
.filter(postData => postData.status[0] !== 'trash')
.filter(postData => !(postType === 'page' && postData.post_name[0] === 'sample-page'))
const postsForType = getItemsOfType(allPostData, postType)
.filter(postData => postData.childValue('status') !== 'trash')
.filter(postData => !(postType === 'page' && postData.childValue('post_name') === 'sample-page'))
.map(postData => buildPost(postData));
if (postsForType.length > 0) {
@@ -80,15 +77,15 @@ function buildPost(data) {
data,
// body content converted to markdown
content: translator.getPostContent(data.encoded[0]),
content: translator.getPostContent(data.childValue('encoded')),
// particularly useful values for all sorts of things
type: data.post_type[0],
id: data.post_id[0],
isDraft: data.status[0] === 'draft',
slug: decodeURIComponent(data.post_name[0]),
type: data.childValue('post_type'),
id: data.childValue('post_id'),
isDraft: data.childValue('status') === 'draft',
slug: decodeURIComponent(data.childValue('post_name')),
date: getPostDate(data),
coverImageId: getPostMetaValue(data.postmeta, '_thumbnail_id'),
coverImageId: getPostMetaValue(data, '_thumbnail_id'),
// these are possibly set later in mergeImagesIntoPosts()
coverImage: undefined,
@@ -97,44 +94,57 @@ function buildPost(data) {
}
function getPostDate(data) {
const date = luxon.DateTime.fromRFC2822(data.pubDate[0] ?? '', { zone: shared.config.customDateTimezone });
const date = luxon.DateTime.fromRFC2822(data.childValue('pubDate'), { zone: shared.config.customDateTimezone });
return date.isValid ? date : undefined;
}
function getPostMetaValue(metas, key) {
const meta = metas && metas.find((meta) => meta.meta_key[0] === key);
return meta ? meta.meta_value[0] : undefined;
function getPostMetaValue(data, key) {
const metas = data.children('postmeta');
const meta = metas.find((meta) => meta.childValue('meta_key') === key);
return meta ? meta.childValue('meta_value') : undefined;
}
function collectAttachedImages(channelData) {
const images = getItemsOfType(channelData, 'attachment')
function collectAttachedImages(allPostData) {
const images = getItemsOfType(allPostData, 'attachment')
// filter to certain image file types
.filter(attachment => attachment.attachment_url && (/\.(gif|jpe?g|png|webp)$/i).test(attachment.attachment_url[0]))
.filter(attachment => {
const url = attachment.childValue('attachment_url');
return url && (/\.(gif|jpe?g|png|webp)$/i).test(url);
})
.map(attachment => ({
id: attachment.post_id[0],
postId: attachment.post_parent[0],
url: attachment.attachment_url[0]
id: attachment.childValue('post_id'),
postId: attachment.optionalChildValue('post_parent') ?? 'nope', // may not exist (cover image in a squarespace export, for example)
url: attachment.childValue('attachment_url')
}));
console.log(images.length + ' attached images found.');
return images;
}
function collectScrapedImages(channelData, postTypes) {
function collectScrapedImages(allPostData, postTypes) {
const images = [];
postTypes.forEach(postType => {
getItemsOfType(channelData, postType).forEach(postData => {
const postId = postData.post_id[0];
const postContent = postData.encoded[0];
const postLink = postData.link[0];
getItemsOfType(allPostData, postType).forEach(postData => {
const postId = postData.childValue('post_id');
const postContent = postData.childValue('encoded');
const scrapedUrls = [...postContent.matchAll(/<img\s[^>]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]);
scrapedUrls.forEach((scrapedUrl) => {
let url;
if (isAbsoluteUrl(scrapedUrl)) {
url = scrapedUrl;
} else {
const postLink = postData.childValue('link');
if (isAbsoluteUrl(postLink)) {
url = new URL(scrapedUrl, postLink).href;
} else {
throw new Error(`Unable to determine absolute URL from scraped image URL '${scrapedUrl}' and post link URL '${postLink}'.`);
}
}
const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)];
matches.forEach(match => {
// base the matched image URL relative to the post URL
const url = new URL(match[1], postLink).href;
images.push({
id: -1,
postId: postId,
id: 'nope', // scraped images don't have an id
postId,
url
});
});
@@ -184,3 +194,7 @@ function populateFrontmatter(posts) {
});
}
// returns true if url starts with an http:// or https:// scheme (case insensitive)
function isAbsoluteUrl(url) {
	const absolutePattern = /^https?:\/\//i;
	return absolutePattern.test(url);
}
+1 -1
View File
@@ -119,7 +119,7 @@ export function load() {
{
name: 'markdown-file-write-delay',
type: 'integer',
default: 25
default: 10
},
{
name: 'include-time-with-date',
+14 -1
View File
@@ -14,6 +14,8 @@ function initTurndownService() {
turndownService.use(turndownPluginGfm.tables);
turndownService.remove(['style']); // <style> contents get dumped as plain text, would rather remove
// preserve embedded tweets
turndownService.addRule('tweet', {
filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet',
@@ -34,6 +36,14 @@ function initTurndownService() {
replacement: (content, node) => '\n\n' + node.outerHTML
});
// <div> within <a> can cause extra whitespace that wreck markdown links, so this removes them
turndownService.addRule('a', {
filter: 'a',
replacement: (content) => {
return content.replace(/<\/?div[^>]*>/gi, '');
}
});
// preserve embedded scripts (for tweets, codepens, gists, etc.)
turndownService.addRule('script', {
filter: 'script',
@@ -107,7 +117,7 @@ export function getPostContent(content) {
if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') {
// writeImageFile() will save all content images to a relative /images
// folder so update references in post content to match
content = content.replace(/(<img[^>]*src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
content = content.replace(/(<img\s[^>]*?src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
}
// preserve "more" separator, max one per post, optionally with custom label
@@ -124,5 +134,8 @@ export function getPostContent(content) {
// clean up extra spaces in list items
content = content.replace(/(-|\d+\.) +/g, '$1 ');
// collapse excessive newlines (can happen with a lot of <div>)
content = content.replace(/(\r?\n){3,}/g, '\n\n');
return content;
}