diff --git a/src/data.js b/src/data.js index 2c1d4b6..55db05a 100644 --- a/src/data.js +++ b/src/data.js @@ -5,20 +5,15 @@ class Data { #expression; constructor(obj, expression) { + // xml2js returns leaf nodes as strings, turn those into consistent objects + // I found this to be safer and more efficient than using the explicitCharkey option this.#obj = typeof obj === 'string' ? { _: obj } : obj; + + // this identifies how the object was referenced, helps a ton with debugging this.#expression = expression; } - get value() { - const value = this.#obj._; - if (value === undefined) { - throw new Error(`Could not get value from ${this.#expression}.`); - } - - return value; - } - - #buildExpression(propName, index) { + #buildExpression(propName, index = undefined) { let expression = `${this.#expression}.${propName}`; if (index !== undefined) { expression += `[${index}]`; @@ -27,11 +22,22 @@ class Data { return expression; } + // used by "optional" functions to return undefined instead of throwing an error + #optional(func) { + try { + return func(); + } catch (ex) { + return undefined; + } + } + + // will not throw an error if property doesn't exist, defaults to empty array children(propName) { const nodes = this.#obj[propName] ?? []; return nodes.map((value, index) => new Data(value, this.#buildExpression(propName, index))); } + // throws an error if property (or index on property) doesn't exist child(propName, index = 0) { const nodes = this.#obj[propName]; if (nodes === undefined) { @@ -46,10 +52,22 @@ class Data { return new Data(node, this.#buildExpression(propName, index)); } + // convenience function, since it's very common to want the value of a child childValue(propName, index = 0) { - return this.child(propName, index).value; + return this.child(propName, index).value(); + } + + // throws an error if this object doesn't have a value string + value() { + const value = this.#obj._; + if (value === undefined) { + throw new Error(`Could not get value from ${this.#expression}.`); + } + + return value; } + // throws an error if attribute does not exist attribute(attrName) { const attribute = this.#obj.$?.[attrName]; if (attribute === undefined) { @@ -58,6 +76,18 @@ class Data { return attribute; } + + optionalChild(propName, index = 0) { + return this.#optional(() => this.child(propName, index)); + } + + optionalChildValue(propName, index = 0) { + return this.#optional(() => this.childValue(propName, index)); + } + + optionalValue() { + return this.#optional(() => this.value()); + } } export async function load(content) { diff --git a/src/frontmatter.js b/src/frontmatter.js index 74aede8..22f37c9 100644 --- a/src/frontmatter.js +++ b/src/frontmatter.js @@ -1,6 +1,7 @@ export function author(post) { - // not decoded, WordPress doesn't allow funky characters in usernames anyway - return post.data.childValue('creator'); + // not decoded (WordPress doesn't allow funky characters in usernames anyway) + // surprisingly, does not always exist (squarespace exports, for example) + return post.data.optionalChildValue('creator'); } export function categories(post) { @@ -28,7 +29,9 @@ export function draft(post) { export function excerpt(post) { // not decoded, newlines collapsed - return post.data.childValue('encoded', 1).replace(/[\r\n]+/gm, ' '); + // does not always exist (squarespace exports, for example) + const encoded = post.data.optionalChildValue('encoded', 1); + return encoded ? encoded.replace(/[\r\n]+/gm, ' ') : undefined; } export function id(post) { diff --git a/src/parser.js b/src/parser.js index 023b4da..915c97b 100644 --- a/src/parser.js +++ b/src/parser.js @@ -115,7 +115,7 @@ function collectAttachedImages(allPostData) { }) .map(attachment => ({ id: attachment.childValue('post_id'), - postId: attachment.childValue('post_parent'), + postId: attachment.optionalChildValue('post_parent') ?? 'nope', // may not exist (cover image in a squarespace export, for example) url: attachment.childValue('attachment_url') })); @@ -128,16 +128,25 @@ function collectScrapedImages(allPostData, postTypes) { postTypes.forEach(postType => { getItemsOfType(allPostData, postType).forEach(postData => { const postId = postData.childValue('post_id'); + const postContent = postData.childValue('encoded'); - const postLink = postData.childValue('link'); + const scrapedUrls = [...postContent.matchAll(/]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]); + scrapedUrls.forEach((scrapedUrl) => { + let url; + if (isAbsoluteUrl(scrapedUrl)) { + url = scrapedUrl; + } else { + const postLink = postData.childValue('link'); + if (isAbsoluteUrl(postLink)) { + url = new URL(scrapedUrl, postLink).href; + } else { + throw new Error(`Unable to determine absolute URL from scraped image URL '${scrapedUrl}' and post link URL '${postLink}'.`); + } + } - const matches = [...postContent.matchAll(/]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)]; - matches.forEach(match => { - // base the matched image URL relative to the post URL - const url = new URL(match[1], postLink).href; images.push({ - id: -1, - postId: postId, + id: 'nope', // scraped images don't have an id + postId, url }); }); @@ -187,3 +196,7 @@ function populateFrontmatter(posts) { }); } +function isAbsoluteUrl(url) { + return (/^https?:\/\//i).test(url); +} + diff --git a/src/translator.js b/src/translator.js index a4f3bb1..a317f4b 100644 --- a/src/translator.js +++ b/src/translator.js @@ -107,7 +107,7 @@ export function getPostContent(content) { if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') { // writeImageFile() will save all content images to a relative /images // folder so update references in post content to match - content = content.replace(/(]*src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3'); + content = content.replace(/(]*?src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3'); } // preserve "more" separator, max one per post, optionally with custom label