Merge pull request #149 from lonekorean/parse-debugging

Parse debugging
2026-06-05 15:09:59 +09:00 · 2025-02-25 17:12:14 -05:00
parent bc9d17b59e cb435c6fd5
commit 1be32e76a6
5 changed files with 211 additions and 87 deletions
@@ -0,0 +1,108 @@
+import xml2js from 'xml2js';
+
+class Data { // wraps one parsed xml2js node and exposes accessors that throw descriptive errors
+	#obj; // the wrapped node; leaf strings are normalized to { _: string } by the constructor
+	#expression; // path describing how this node was reached (e.g. "rss.channel[0]"), used in error messages
+
+	constructor(obj, expression) {
+		// xml2js returns leaf nodes as strings, turn those into consistent objects
+		// I found this to be safer and more efficient than using the explicitCharkey option
+		this.#obj = typeof obj === 'string' ? { _: obj } : obj;
+
+		// this identifies how the object was referenced, helps a ton with debugging
+		this.#expression = expression;
+	}
+
+	#buildExpression(propName, index = undefined) { // appends ".propName" (and "[index]" when an index is given) to this node's expression
+		let expression = `${this.#expression}.${propName}`;
+		if (index !== undefined) {
+			expression += `[${index}]`;
+		}
+
+		return expression;
+	}
+
+	// used by "optional" functions to return undefined instead of throwing an error
+	#optional(func) {
+		try {
+			return func();
+		} catch (ex) { // note: catches every error thrown by func, not only the "Could not ..." errors raised by this class
+			return undefined;
+		}
+	}
+
+	// returns every node under propName wrapped as Data; will not throw an error if property doesn't exist, defaults to empty array
+	children(propName) {
+		const nodes = this.#obj[propName] ?? [];
+		return nodes.map((value, index) => new Data(value, this.#buildExpression(propName, index)));
+	}
+
+	// returns the node at propName[index] wrapped as Data; throws an error if property (or index on property) doesn't exist
+	child(propName, index = 0) {
+		const nodes = this.#obj[propName];
+		if (nodes === undefined) {
+			throw new Error(`Could not find ${this.#buildExpression(propName)}.`);
+		}
+
+		const node = nodes[index];
+		if (node === undefined) {
+			throw new Error(`Could not find ${this.#buildExpression(propName, index)}.`);
+		}
+
+		return new Data(node, this.#buildExpression(propName, index));
+	}
+
+	// convenience function, since it's very common to want the value of a child (throws whenever child() or value() would)
+	childValue(propName, index = 0) {
+		return this.child(propName, index).value();
+	}
+	
+	// returns what is stored under the "_" key; throws an error if this object doesn't have a value string
+	value() {
+		const value = this.#obj._;
+		if (value === undefined) {
+			throw new Error(`Could not get value from ${this.#expression}.`);
+		}
+
+		return value;
+	}
+
+	// returns the named entry stored under the "$" key; throws an error if attribute (or "$" itself) does not exist
+	attribute(attrName) {
+		const attribute = this.#obj.$?.[attrName];
+		if (attribute === undefined) {
+			throw new Error(`Could not get attribute ${attrName} from ${this.#expression}.`);
+		}
+
+		return attribute;
+	}
+
+	optionalChild(propName, index = 0) { // same as child(), but returns undefined instead of throwing
+		return this.#optional(() => this.child(propName, index));
+	}
+
+	optionalChildValue(propName, index = 0) { // same as childValue(), but returns undefined instead of throwing
+		return this.#optional(() => this.childValue(propName, index));
+	}
+
+	optionalValue() { // same as value(), but returns undefined instead of throwing
+		return this.#optional(() => this.value());
+	}
+}
+
+export async function load(content) { // parses the XML string in content and returns its <rss> root wrapped as Data
+	const rootData = await xml2js.parseStringPromise(content, {
+		tagNameProcessors: [xml2js.processors.stripPrefix], // strip namespace prefixes from tag names
+		trim: true
+	}).catch((ex) => {
+		ex.message = 'Could not parse XML. This likely means your import file is malformed.\n\n' + ex.message; // prefix the parser's own message, then rethrow the same error
+		throw ex;
+	});
+
+	const rssData = rootData?.rss; // rootData is null for empty/whitespace-only input, treat that like a missing <rss> node
+	if (rssData === undefined) {
+		throw new Error('Could not find <rss> root node. This likely means your import file is malformed.');
+	}
+
+	return new Data(rssData, 'rss'); // 'rss' is the root of every expression shown in later error messages
+}
@@ -1,74 +1,63 @@
-// get author, without decoding
-// WordPress doesn't allow funky characters in usernames anyway
 export function author(post) {
-	return post.data.creator[0];
+	// not decoded (WordPress doesn't allow funky characters in usernames anyway)
+	// surprisingly, does not always exist (squarespace exports, for example), in which case this is undefined
+	return post.data.optionalChildValue('creator');
 }

-// get array of decoded category names, excluding 'uncategorized'
 export function categories(post) {
-	if (!post.data.category) {
-		return [];
-	}
-
-	const categories = post.data.category
-		.filter(category => category.$.domain === 'category')
-		.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
-
-	return categories.filter((category) => category !== 'uncategorized');
+	// array of decoded category names, excluding 'uncategorized' (empty array when the post has no <category> nodes)
+	const categories = post.data.children('category');
+	return categories
+		.filter((category) => category.attribute('domain') === 'category' && category.attribute('nicename') !== 'uncategorized') // attribute() throws if domain is missing (or nicename, once domain matches)
+		.map((category) => decodeURIComponent(category.attribute('nicename')));
 }

-// get cover image filename, previously decoded and set on post
-// this one is unique as it relies on special logic executed by the parser
 export function coverImage(post) {
+	// cover image filename, previously parsed and decoded (undefined unless it was set on the post after parsing)
 	return post.coverImage;
 }

-// get post date, previously saved as a luxon datetime object on post
 export function date(post) {
+	// a luxon datetime object, previously parsed (undefined when the parsed date was not valid)
 	return post.date;
 }

-// get boolean indicating if post is a draft
-// this will only be included if true, otherwise it's left off
 export function draft(post) {
+	// true when the previously parsed status marked the post as a draft, otherwise undefined so it is only included when true
 	return post.isDraft ? true : undefined;
 }

-// get excerpt, not decoded, newlines collapsed
 export function excerpt(post) {
-	return post.data.encoded[1].replace(/[\r\n]+/gm, ' ');
+	// not decoded, newlines collapsed; read from the second <encoded> node (index 1)
+	// does not always exist (squarespace exports, for example); a missing or empty value yields undefined
+	const encoded = post.data.optionalChildValue('encoded', 1);
+	return encoded ? encoded.replace(/[\r\n]+/gm, ' ') : undefined;
 }

-// get ID, as an integer
 export function id(post) {
+	// previously parsed as a string, converted to integer here (NOTE(review): parseInt is called without an explicit radix)
 	return parseInt(post.id);
 }

-// get slug, previously decoded and set on post
 export function slug(post) {
+	// previously parsed and decoded, returned as stored on the post
 	return post.slug;
 }

-// get array of decoded tag names
 export function tags(post) {
-	if (!post.data.category) {
-		return [];
-	}
-
-	const categories = post.data.category
-		.filter(category => category.$.domain === 'post_tag')
-		.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
-
-	return categories;
+	// array of decoded tag names (yes, they come from <category> nodes, not a typo), empty when there are no such nodes
+	const categories = post.data.children('category');
+	return categories
+		.filter((category) => category.attribute('domain') === 'post_tag') // attribute() throws if a <category> node has no domain attribute
+		.map((category) => decodeURIComponent(category.attribute('nicename')));
 }

-// get simple post title, but not decoded like other frontmatter string fields
 export function title(post) {
-	return post.data.title[0];
+	// not decoded; childValue() throws if the title node or its value is missing
+	return post.data.childValue('title');
 }

-// get type, often this will always be "post"
-// but can also be "page" or other custom types
 export function type(post) {
+	// previously parsed but not decoded, can be "post", "page", or other custom types; returned as stored on the post
 	return post.type;
 }
@@ -1,6 +1,6 @@
 import fs from 'fs';
 import * as luxon from 'luxon';
-import xml2js from 'xml2js';
+import * as data from './data.js';
 import * as frontmatter from './frontmatter.js';
 import * as shared from './shared.js';
 import * as translator from './translator.js';
@@ -8,21 +8,18 @@ import * as translator from './translator.js';
 export async function parseFilePromise() {
 	console.log('\nParsing...');
 	const content = await fs.promises.readFile(shared.config.input, 'utf8');
-	const allData = await xml2js.parseStringPromise(content, {
-		trim: true,
-		tagNameProcessors: [xml2js.processors.stripPrefix]
-	});
-	const channelData = allData.rss.channel[0].item;
+	const rssData = await data.load(content);
+	const allPostData = rssData.child('channel').children('item');

-	const postTypes = getPostTypes(channelData);
-	const posts = collectPosts(channelData, postTypes);
+	const postTypes = getPostTypes(allPostData);
+	const posts = collectPosts(allPostData, postTypes);

 	const images = [];
 	if (shared.config.saveImages === 'attached' || shared.config.saveImages === 'all') {
-		images.push(...collectAttachedImages(channelData));
+		images.push(...collectAttachedImages(allPostData));
 	}
 	if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') {
-		images.push(...collectScrapedImages(channelData, postTypes));
+		images.push(...collectScrapedImages(allPostData, postTypes));
 	}

 	mergeImagesIntoPosts(images, posts);
@@ -31,11 +28,11 @@ export async function parseFilePromise() {
 	return posts;
 }

-function getPostTypes(channelData) {
+function getPostTypes(allPostData) {
 	// search export file for all post types minus some specific types we don't want
-	const types = channelData
-		.map(item => item.post_type[0])
-		.filter(type => ![
+	const postTypes = allPostData
+		.map((postData) => postData.childValue('post_type'))
+		.filter((postType) => ![
 			'attachment',
 			'revision',
 			'nav_menu_item',
@@ -48,20 +45,20 @@ function getPostTypes(channelData) {
 			'wp_navigation',
 			'wp_template',
 			'wp_template_part'
-		].includes(type));
-	return [...new Set(types)]; // remove duplicates
+		].includes(postType));
+	return [...new Set(postTypes)]; // remove duplicates
 }

-function getItemsOfType(channelData, type) {
-	return channelData.filter(item => item.post_type[0] === type);
+function getItemsOfType(allPostData, type) { // keeps only the items whose post_type value equals type
+	return allPostData.filter(item => item.childValue('post_type') === type); // childValue() throws if an item has no post_type
 }

-function collectPosts(channelData, postTypes) {
+function collectPosts(allPostData, postTypes) {
 	let allPosts = [];
 	postTypes.forEach(postType => {
-		const postsForType = getItemsOfType(channelData, postType)
-			.filter(postData => postData.status[0] !== 'trash')
-			.filter(postData => !(postType === 'page' && postData.post_name[0] === 'sample-page'))
+		const postsForType = getItemsOfType(allPostData, postType)
+			.filter(postData => postData.childValue('status') !== 'trash')
+			.filter(postData => !(postType === 'page' && postData.childValue('post_name') === 'sample-page'))
 			.map(postData => buildPost(postData));

 		if (postsForType.length > 0) {
@@ -80,15 +77,15 @@ function buildPost(data) {
 		data,

 		// body content converted to markdown
-		content: translator.getPostContent(data.encoded[0]),
+		content: translator.getPostContent(data.childValue('encoded')),

 		// particularly useful values for all sorts of things
-		type: data.post_type[0],
-		id: data.post_id[0],
-		isDraft: data.status[0] === 'draft',
-		slug: decodeURIComponent(data.post_name[0]),
+		type: data.childValue('post_type'),
+		id: data.childValue('post_id'),
+		isDraft: data.childValue('status') === 'draft',
+		slug: decodeURIComponent(data.childValue('post_name')),
 		date: getPostDate(data),
-		coverImageId: getPostMetaValue(data.postmeta, '_thumbnail_id'),
+		coverImageId: getPostMetaValue(data, '_thumbnail_id'),

 		// these are possibly set later in mergeImagesIntoPosts()
 		coverImage: undefined,
@@ -97,44 +94,57 @@ function buildPost(data) {
 }

 function getPostDate(data) {
-	const date = luxon.DateTime.fromRFC2822(data.pubDate[0] ?? '', { zone: shared.config.customDateTimezone });
+	const date = luxon.DateTime.fromRFC2822(data.childValue('pubDate'), { zone: shared.config.customDateTimezone }); // childValue() throws if pubDate is missing
 	return date.isValid ? date : undefined;
 }

-function getPostMetaValue(metas, key) {
-	const meta = metas && metas.find((meta) => meta.meta_key[0] === key);
-	return meta ? meta.meta_value[0] : undefined;
+function getPostMetaValue(data, key) { // returns the meta_value of the first postmeta node whose meta_key equals key, or undefined if none matches
+	const metas = data.children('postmeta'); // empty array when there are no postmeta nodes
+	const meta = metas.find((meta) => meta.childValue('meta_key') === key);
+	return meta ? meta.childValue('meta_value') : undefined;
 }

-function collectAttachedImages(channelData) {
-	const images = getItemsOfType(channelData, 'attachment')
+function collectAttachedImages(allPostData) { // returns { id, postId, url } for every attachment item whose URL is a supported image type
+	const images = getItemsOfType(allPostData, 'attachment')
 		// filter to certain image file types
-		.filter(attachment => attachment.attachment_url && (/\.(gif|jpe?g|png|webp)$/i).test(attachment.attachment_url[0]))
+		.filter(attachment => {
+			const url = attachment.optionalChildValue('attachment_url'); // optional: an attachment without a URL is skipped (as before), not a fatal error
+			return url && (/\.(gif|jpe?g|png|webp)$/i).test(url);
+		})
 		.map(attachment => ({
-			id: attachment.post_id[0],
-			postId: attachment.post_parent[0],
-			url: attachment.attachment_url[0]
+			id: attachment.childValue('post_id'),
+			postId: attachment.optionalChildValue('post_parent') ?? 'nope', // may not exist (cover image in a squarespace export, for example)
+			url: attachment.childValue('attachment_url')
 		}));

 	console.log(images.length + ' attached images found.');
 	return images;
 }

-function collectScrapedImages(channelData, postTypes) {
+function collectScrapedImages(allPostData, postTypes) {
 	const images = [];
 	postTypes.forEach(postType => {
-		getItemsOfType(channelData, postType).forEach(postData => {
-			const postId = postData.post_id[0];
-			const postContent = postData.encoded[0];
-			const postLink = postData.link[0];
+		getItemsOfType(allPostData, postType).forEach(postData => {
+			const postId = postData.childValue('post_id');
+			
+			const postContent = postData.childValue('encoded');
+			const scrapedUrls = [...postContent.matchAll(/<img\s[^>]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]);
+			scrapedUrls.forEach((scrapedUrl) => {
+				let url;
+				if (isAbsoluteUrl(scrapedUrl)) {
+					url = scrapedUrl;
+				} else {
+					const postLink = postData.childValue('link');
+					if (isAbsoluteUrl(postLink)) {
+						url = new URL(scrapedUrl, postLink).href;
+					} else {
+						throw new Error(`Unable to determine absolute URL from scraped image URL '${scrapedUrl}' and post link URL '${postLink}'.`);
+					}
+				}

-			const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)];
-			matches.forEach(match => {
-				// base the matched image URL relative to the post URL
-				const url = new URL(match[1], postLink).href;
 				images.push({
-					id: -1,
-					postId: postId,
+					id: 'nope', // scraped images don't have an id
+					postId,
 					url
 				});
 			});
@@ -184,3 +194,7 @@ function populateFrontmatter(posts) {
 	});
 }

+function isAbsoluteUrl(url) { // true when url starts with http:// or https:// (case-insensitive)
+	return (/^https?:\/\//i).test(url);
+}
+
@@ -119,7 +119,7 @@ export function load() {
 		{
 			name: 'markdown-file-write-delay',
 			type: 'integer',
-			default: 25
+			default: 10
 		},
 		{
 			name: 'include-time-with-date',
@@ -14,6 +14,8 @@ function initTurndownService() {

 	turndownService.use(turndownPluginGfm.tables);

+	turndownService.remove(['style']); // <style> contents get dumped as plain text, would rather remove
+
 	// preserve embedded tweets
 	turndownService.addRule('tweet', {
 		filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet',
@@ -34,6 +36,14 @@ function initTurndownService() {
 		replacement: (content, node) => '\n\n' + node.outerHTML
 	});

+	// <div> within <a> can cause extra whitespace that wreck markdown links, so this removes them
+	turndownService.addRule('a', {
+		filter: 'a',
+		replacement: (content) => {
+			return content.replace(/<\/?div[^>]*>/gi, '');
+		}
+	});
+
 	// preserve embedded scripts (for tweets, codepens, gists, etc.)
 	turndownService.addRule('script', {
 		filter: 'script',
@@ -107,7 +117,7 @@ export function getPostContent(content) {
 	if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') {
 		// writeImageFile() will save all content images to a relative /images
 		// folder so update references in post content to match
-		content = content.replace(/(<img[^>]*src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
+		content = content.replace(/(<img\s[^>]*?src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
 	}

 	// preserve "more" separator, max one per post, optionally with custom label
@@ -124,5 +134,8 @@ export function getPostContent(content) {
 	// clean up extra spaces in list items
 	content = content.replace(/(-|\d+\.) +/g, '$1 ');

+	// collapse excessive newlines (can happen with a lot of <div>)
+	content = content.replace(/(\r?\n){3,}/g, '\n\n');
+
 	return content;
 }