diff --git a/src/data.js b/src/data.js
index 2c1d4b6..55db05a 100644
--- a/src/data.js
+++ b/src/data.js
@@ -5,20 +5,15 @@ class Data {
#expression;
constructor(obj, expression) {
+ // xml2js returns leaf nodes as strings, turn those into consistent objects
+ // I found this to be safer and more efficient than using the explicitCharkey option
this.#obj = typeof obj === 'string' ? { _: obj } : obj;
+
+ // this identifies how the object was referenced, helps a ton with debugging
this.#expression = expression;
}
- get value() {
- const value = this.#obj._;
- if (value === undefined) {
- throw new Error(`Could not get value from ${this.#expression}.`);
- }
-
- return value;
- }
-
- #buildExpression(propName, index) {
+ #buildExpression(propName, index = undefined) {
let expression = `${this.#expression}.${propName}`;
if (index !== undefined) {
expression += `[${index}]`;
@@ -27,11 +22,22 @@ class Data {
return expression;
}
+ // used by "optional" functions to return undefined instead of throwing an error
+ #optional(func) {
+ try {
+ return func();
+ } catch (ex) {
+ return undefined;
+ }
+ }
+
+ // will not throw an error if property doesn't exist, defaults to empty array
children(propName) {
const nodes = this.#obj[propName] ?? [];
return nodes.map((value, index) => new Data(value, this.#buildExpression(propName, index)));
}
+ // throws an error if property (or index on property) doesn't exist
child(propName, index = 0) {
const nodes = this.#obj[propName];
if (nodes === undefined) {
@@ -46,10 +52,22 @@ class Data {
return new Data(node, this.#buildExpression(propName, index));
}
+ // convenience function, since it's very common to want the value of a child
childValue(propName, index = 0) {
- return this.child(propName, index).value;
+ return this.child(propName, index).value();
+ }
+
+ // throws an error if this object doesn't have a value string
+ value() {
+ const value = this.#obj._;
+ if (value === undefined) {
+ throw new Error(`Could not get value from ${this.#expression}.`);
+ }
+
+ return value;
}
+ // throws an error if attribute does not exist
attribute(attrName) {
const attribute = this.#obj.$?.[attrName];
if (attribute === undefined) {
@@ -58,6 +76,18 @@ class Data {
return attribute;
}
+
+ optionalChild(propName, index = 0) {
+ return this.#optional(() => this.child(propName, index));
+ }
+
+ optionalChildValue(propName, index = 0) {
+ return this.#optional(() => this.childValue(propName, index));
+ }
+
+ optionalValue() {
+ return this.#optional(() => this.value());
+ }
}
export async function load(content) {
diff --git a/src/frontmatter.js b/src/frontmatter.js
index 74aede8..22f37c9 100644
--- a/src/frontmatter.js
+++ b/src/frontmatter.js
@@ -1,6 +1,7 @@
export function author(post) {
- // not decoded, WordPress doesn't allow funky characters in usernames anyway
- return post.data.childValue('creator');
+ // not decoded (WordPress doesn't allow funky characters in usernames anyway)
+ // surprisingly, does not always exist (squarespace exports, for example)
+ return post.data.optionalChildValue('creator');
}
export function categories(post) {
@@ -28,7 +29,9 @@ export function draft(post) {
export function excerpt(post) {
// not decoded, newlines collapsed
- return post.data.childValue('encoded', 1).replace(/[\r\n]+/gm, ' ');
+ // does not always exist (squarespace exports, for example)
+ const encoded = post.data.optionalChildValue('encoded', 1);
+ return encoded ? encoded.replace(/[\r\n]+/gm, ' ') : undefined;
}
export function id(post) {
diff --git a/src/parser.js b/src/parser.js
index 023b4da..915c97b 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -115,7 +115,7 @@ function collectAttachedImages(allPostData) {
})
.map(attachment => ({
id: attachment.childValue('post_id'),
- postId: attachment.childValue('post_parent'),
+ postId: attachment.optionalChildValue('post_parent') ?? 'nope', // may not exist (cover image in a squarespace export, for example)
url: attachment.childValue('attachment_url')
}));
@@ -128,16 +128,25 @@ function collectScrapedImages(allPostData, postTypes) {
postTypes.forEach(postType => {
getItemsOfType(allPostData, postType).forEach(postData => {
const postId = postData.childValue('post_id');
+
const postContent = postData.childValue('encoded');
- const postLink = postData.childValue('link');
+ const scrapedUrls = [...postContent.matchAll(/<img[^>]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]);
+ scrapedUrls.forEach((scrapedUrl) => {
+ let url;
+ if (isAbsoluteUrl(scrapedUrl)) {
+ url = scrapedUrl;
+ } else {
+ const postLink = postData.childValue('link');
+ if (isAbsoluteUrl(postLink)) {
+ url = new URL(scrapedUrl, postLink).href;
+ } else {
+ throw new Error(`Unable to determine absolute URL from scraped image URL '${scrapedUrl}' and post link URL '${postLink}'.`);
+ }
+ }
- const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)];
- matches.forEach(match => {
- // base the matched image URL relative to the post URL
- const url = new URL(match[1], postLink).href;
images.push({
- id: -1,
- postId: postId,
+ id: 'nope', // scraped images don't have an id
+ postId,
url
});
});
@@ -187,3 +196,7 @@ function populateFrontmatter(posts) {
});
}
+function isAbsoluteUrl(url) {
+ return (/^https?:\/\//i).test(url);
+}
+
diff --git a/src/translator.js b/src/translator.js
index a4f3bb1..a317f4b 100644
--- a/src/translator.js
+++ b/src/translator.js
@@ -107,7 +107,7 @@ export function getPostContent(content) {
if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') {
// writeImageFile() will save all content images to a relative /images
// folder so update references in post content to match
- content = content.replace(/(<img[^>]*src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
+ content = content.replace(/(<img[^>]*?src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
}
// preserve "more" separator, max one per post, optionally with custom label