diff --git a/src/data.js b/src/data.js new file mode 100644 index 0000000..55db05a --- /dev/null +++ b/src/data.js @@ -0,0 +1,108 @@ +import xml2js from 'xml2js'; + +class Data { + #obj; + #expression; + + constructor(obj, expression) { + // xml2js returns leaf nodes as strings, turn those into consistent objects + // I found this to be safer and more efficient than using the explicitCharkey option + this.#obj = typeof obj === 'string' ? { _: obj } : obj; + + // this identifies how the object was referenced, helps a ton with debugging + this.#expression = expression; + } + + #buildExpression(propName, index = undefined) { + let expression = `${this.#expression}.${propName}`; + if (index !== undefined) { + expression += `[${index}]`; + } + + return expression; + } + + // used by "optional" functions to return undefined instead of throwing an error + #optional(func) { + try { + return func(); + } catch (ex) { + return undefined; + } + } + + // will not throw an error if property doesn't exist, defaults to empty array + children(propName) { + const nodes = this.#obj[propName] ?? 
[]; + return nodes.map((value, index) => new Data(value, this.#buildExpression(propName, index))); + } + + // throws an error if property (or index on property) doesn't exist + child(propName, index = 0) { + const nodes = this.#obj[propName]; + if (nodes === undefined) { + throw new Error(`Could not find ${this.#buildExpression(propName)}.`); + } + + const node = nodes[index]; + if (node === undefined) { + throw new Error(`Could not find ${this.#buildExpression(propName, index)}.`); + } + + return new Data(node, this.#buildExpression(propName, index)); + } + + // convenience function, since it's very common to want the value of a child + childValue(propName, index = 0) { + return this.child(propName, index).value(); + } + + // throws an error if this object doesn't have a value string + value() { + const value = this.#obj._; + if (value === undefined) { + throw new Error(`Could not get value from ${this.#expression}.`); + } + + return value; + } + + // throws an error if attribute does not exist + attribute(attrName) { + const attribute = this.#obj.$?.[attrName]; + if (attribute === undefined) { + throw new Error(`Could not get attribute ${attrName} from ${this.#expression}.`); + } + + return attribute; + } + + optionalChild(propName, index = 0) { + return this.#optional(() => this.child(propName, index)); + } + + optionalChildValue(propName, index = 0) { + return this.#optional(() => this.childValue(propName, index)); + } + + optionalValue() { + return this.#optional(() => this.value()); + } +} + +export async function load(content) { + const rootData = await xml2js.parseStringPromise(content, { + tagNameProcessors: [xml2js.processors.stripPrefix], + trim: true + }).catch((ex) => { + ex.message = 'Could not parse XML. This likely means your import file is malformed.\n\n' + ex.message; + throw ex; + }); + + const rssData = rootData.rss; + if (rssData === undefined) { + throw new Error('Could not find root node. 
This likely means your import file is malformed.') + } + + return new Data(rssData, 'rss'); +} diff --git a/src/frontmatter.js b/src/frontmatter.js index 3e34b9e..22f37c9 100644 --- a/src/frontmatter.js +++ b/src/frontmatter.js @@ -1,74 +1,63 @@ -// get author, without decoding -// WordPress doesn't allow funky characters in usernames anyway export function author(post) { - return post.data.creator[0]; + // not decoded (WordPress doesn't allow funky characters in usernames anyway) + // surprisingly, does not always exist (squarespace exports, for example) + return post.data.optionalChildValue('creator'); } -// get array of decoded category names, excluding 'uncategorized' export function categories(post) { - if (!post.data.category) { - return []; - } - - const categories = post.data.category - .filter(category => category.$.domain === 'category') - .map(({ $: attributes }) => decodeURIComponent(attributes.nicename)); - - return categories.filter((category) => category !== 'uncategorized'); + // array of decoded category names, excluding 'uncategorized' + const categories = post.data.children('category'); + return categories + .filter((category) => category.attribute('domain') === 'category' && category.attribute('nicename') !== 'uncategorized') + .map((category) => decodeURIComponent(category.attribute('nicename'))); } -// get cover image filename, previously decoded and set on post -// this one is unique as it relies on special logic executed by the parser export function coverImage(post) { + // cover image filename, previously parsed and decoded return post.coverImage; } -// get post date, previously saved as a luxon datetime object on post export function date(post) { + // a luxon datetime object, previously parsed return post.date; } -// get boolean indicating if post is a draft -// this will only be included if true, otherwise it's left off export function draft(post) { + // boolean representing the previously parsed draft status, only included when true 
return post.isDraft ? true : undefined; } -// get excerpt, not decoded, newlines collapsed export function excerpt(post) { - return post.data.encoded[1].replace(/[\r\n]+/gm, ' '); + // not decoded, newlines collapsed + // does not always exist (squarespace exports, for example) + const encoded = post.data.optionalChildValue('encoded', 1); + return encoded ? encoded.replace(/[\r\n]+/gm, ' ') : undefined; } -// get ID, as an integer export function id(post) { + // previously parsed as a string, converted to integer here return parseInt(post.id); } -// get slug, previously decoded and set on post export function slug(post) { + // previously parsed and decoded return post.slug; } -// get array of decoded tag names export function tags(post) { - if (!post.data.category) { - return []; - } - - const categories = post.data.category - .filter(category => category.$.domain === 'post_tag') - .map(({ $: attributes }) => decodeURIComponent(attributes.nicename)); - - return categories; + // array of decoded tag names (yes, they come from <category> nodes, not a typo) + const categories = post.data.children('category'); + return categories + .filter((category) => category.attribute('domain') === 'post_tag') + .map((category) => decodeURIComponent(category.attribute('nicename'))); } -// get simple post title, but not decoded like other frontmatter string fields export function title(post) { - return post.data.title[0]; + // not decoded + return post.data.childValue('title'); } -// get type, often this will always be "post" -// but can also be "page" or other custom types export function type(post) { + // previously parsed but not decoded, can be "post", "page", or other custom types return post.type; } diff --git a/src/parser.js b/src/parser.js index cd5cda9..e81c56f 100644 --- a/src/parser.js +++ b/src/parser.js @@ -1,6 +1,6 @@ import fs from 'fs'; import * as luxon from 'luxon'; -import xml2js from 'xml2js'; +import * as data from './data.js'; import * as frontmatter from
'./frontmatter.js'; import * as shared from './shared.js'; import * as translator from './translator.js'; @@ -8,21 +8,18 @@ import * as translator from './translator.js'; export async function parseFilePromise() { console.log('\nParsing...'); const content = await fs.promises.readFile(shared.config.input, 'utf8'); - const allData = await xml2js.parseStringPromise(content, { - trim: true, - tagNameProcessors: [xml2js.processors.stripPrefix] - }); - const channelData = allData.rss.channel[0].item; + const rssData = await data.load(content); + const allPostData = rssData.child('channel').children('item'); - const postTypes = getPostTypes(channelData); - const posts = collectPosts(channelData, postTypes); + const postTypes = getPostTypes(allPostData); + const posts = collectPosts(allPostData, postTypes); const images = []; if (shared.config.saveImages === 'attached' || shared.config.saveImages === 'all') { - images.push(...collectAttachedImages(channelData)); + images.push(...collectAttachedImages(allPostData)); } if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') { - images.push(...collectScrapedImages(channelData, postTypes)); + images.push(...collectScrapedImages(allPostData, postTypes)); } mergeImagesIntoPosts(images, posts); @@ -31,11 +28,11 @@ export async function parseFilePromise() { return posts; } -function getPostTypes(channelData) { +function getPostTypes(allPostData) { // search export file for all post types minus some specific types we don't want - const types = channelData - .map(item => item.post_type[0]) - .filter(type => ![ + const postTypes = allPostData + .map((postData) => postData.childValue('post_type')) + .filter((postType) => ![ 'attachment', 'revision', 'nav_menu_item', @@ -48,20 +45,20 @@ function getPostTypes(channelData) { 'wp_navigation', 'wp_template', 'wp_template_part' - ].includes(type)); - return [...new Set(types)]; // remove duplicates + ].includes(postType)); + return [...new Set(postTypes)]; // 
remove duplicates } -function getItemsOfType(channelData, type) { - return channelData.filter(item => item.post_type[0] === type); +function getItemsOfType(allPostData, type) { + return allPostData.filter(item => item.childValue('post_type') === type); } -function collectPosts(channelData, postTypes) { +function collectPosts(allPostData, postTypes) { let allPosts = []; postTypes.forEach(postType => { - const postsForType = getItemsOfType(channelData, postType) - .filter(postData => postData.status[0] !== 'trash') - .filter(postData => !(postType === 'page' && postData.post_name[0] === 'sample-page')) + const postsForType = getItemsOfType(allPostData, postType) + .filter(postData => postData.childValue('status') !== 'trash') + .filter(postData => !(postType === 'page' && postData.childValue('post_name') === 'sample-page')) .map(postData => buildPost(postData)); if (postsForType.length > 0) { @@ -80,15 +77,15 @@ function buildPost(data) { data, // body content converted to markdown - content: translator.getPostContent(data.encoded[0]), + content: translator.getPostContent(data.childValue('encoded')), // particularly useful values for all sorts of things - type: data.post_type[0], - id: data.post_id[0], - isDraft: data.status[0] === 'draft', - slug: decodeURIComponent(data.post_name[0]), + type: data.childValue('post_type'), + id: data.childValue('post_id'), + isDraft: data.childValue('status') === 'draft', + slug: decodeURIComponent(data.childValue('post_name')), date: getPostDate(data), - coverImageId: getPostMetaValue(data.postmeta, '_thumbnail_id'), + coverImageId: getPostMetaValue(data, '_thumbnail_id'), // these are possibly set later in mergeImagesIntoPosts() coverImage: undefined, @@ -97,44 +94,57 @@ function buildPost(data) { } function getPostDate(data) { - const date = luxon.DateTime.fromRFC2822(data.pubDate[0] ?? 
'', { zone: shared.config.customDateTimezone }); + const date = luxon.DateTime.fromRFC2822(data.childValue('pubDate'), { zone: shared.config.customDateTimezone }); return date.isValid ? date : undefined; } -function getPostMetaValue(metas, key) { - const meta = metas && metas.find((meta) => meta.meta_key[0] === key); - return meta ? meta.meta_value[0] : undefined; +function getPostMetaValue(data, key) { + const metas = data.children('postmeta'); + const meta = metas.find((meta) => meta.childValue('meta_key') === key); + return meta ? meta.childValue('meta_value') : undefined; } -function collectAttachedImages(channelData) { - const images = getItemsOfType(channelData, 'attachment') +function collectAttachedImages(allPostData) { + const images = getItemsOfType(allPostData, 'attachment') // filter to certain image file types - .filter(attachment => attachment.attachment_url && (/\.(gif|jpe?g|png|webp)$/i).test(attachment.attachment_url[0])) + .filter(attachment => { + const url = attachment.optionalChildValue('attachment_url'); + return url && (/\.(gif|jpe?g|png|webp)$/i).test(url); + }) .map(attachment => ({ - id: attachment.post_id[0], - postId: attachment.post_parent[0], - url: attachment.attachment_url[0] + id: attachment.childValue('post_id'), + postId: attachment.optionalChildValue('post_parent') ??
'nope', // may not exist (cover image in a squarespace export, for example) + url: attachment.childValue('attachment_url') })); console.log(images.length + ' attached images found.'); return images; } -function collectScrapedImages(channelData, postTypes) { +function collectScrapedImages(allPostData, postTypes) { const images = []; postTypes.forEach(postType => { - getItemsOfType(channelData, postType).forEach(postData => { - const postId = postData.post_id[0]; - const postContent = postData.encoded[0]; - const postLink = postData.link[0]; + getItemsOfType(allPostData, postType).forEach(postData => { + const postId = postData.childValue('post_id'); + + const postContent = postData.childValue('encoded'); + const scrapedUrls = [...postContent.matchAll(/<img[^>]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]); + scrapedUrls.forEach((scrapedUrl) => { + let url; + if (isAbsoluteUrl(scrapedUrl)) { + url = scrapedUrl; + } else { + const postLink = postData.childValue('link'); + if (isAbsoluteUrl(postLink)) { + url = new URL(scrapedUrl, postLink).href; + } else { + throw new Error(`Unable to determine absolute URL from scraped image URL '${scrapedUrl}' and post link URL '${postLink}'.`); + } + } - const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)]; - matches.forEach(match => { - // base the matched image URL relative to the post URL - const url = new URL(match[1], postLink).href; images.push({ - id: -1, - postId: postId, + id: 'nope', // scraped images don't have an id + postId, url }); }); @@ -184,3 +194,7 @@ function populateFrontmatter(posts) { }); } +function isAbsoluteUrl(url) { + return (/^https?:\/\//i).test(url); +} + diff --git a/src/questions.js b/src/questions.js index 088f886..63d9b7a 100644 --- a/src/questions.js +++ b/src/questions.js @@ -119,7 +119,7 @@ export function load() { { name: 'markdown-file-write-delay', type: 'integer', - default: 25 + default: 10 }, { name: 'include-time-with-date', diff
--git a/src/translator.js b/src/translator.js index a4f3bb1..6c31b81 100644 --- a/src/translator.js +++ b/src/translator.js @@ -14,6 +14,8 @@ function initTurndownService() { turndownService.use(turndownPluginGfm.tables); + turndownService.remove(['style']); //