From 470ba2dc00926ed4437096266e6bf7721cea8ee3 Mon Sep 17 00:00:00 2001 From: Will Boyd Date: Sun, 16 Feb 2025 14:43:22 -0500 Subject: [PATCH 1/9] Catch initial parsing errors, shared.getValue() --- src/parser.js | 44 ++++++++++++++++++++++++++++---------------- src/shared.js | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/src/parser.js b/src/parser.js index cd5cda9..053432c 100644 --- a/src/parser.js +++ b/src/parser.js @@ -8,21 +8,33 @@ import * as translator from './translator.js'; export async function parseFilePromise() { console.log('\nParsing...'); const content = await fs.promises.readFile(shared.config.input, 'utf8'); - const allData = await xml2js.parseStringPromise(content, { + + const rootData = await xml2js.parseStringPromise(content, { trim: true, tagNameProcessors: [xml2js.processors.stripPrefix] + }).catch((ex) => { + ex.message = 'Could not parse XML. This likely means your import file is malformed.\n\n' + ex.message; + throw ex; }); - const channelData = allData.rss.channel[0].item; - const postTypes = getPostTypes(channelData); - const posts = collectPosts(channelData, postTypes); + const rssData = rootData.rss; + if (rssData === undefined) { + throw new Error('Could not find root node. 
This likely means your import file is malformed.') + } + rssData['wetm-expression'] = 'rss'; + + const channelData = shared.getValue(rssData, 'channel', 0); + const allPostData = shared.getValue(channelData, 'item'); + + const postTypes = getPostTypes(allPostData); + const posts = collectPosts(allPostData, postTypes); const images = []; if (shared.config.saveImages === 'attached' || shared.config.saveImages === 'all') { - images.push(...collectAttachedImages(channelData)); + images.push(...collectAttachedImages(allPostData)); } if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') { - images.push(...collectScrapedImages(channelData, postTypes)); + images.push(...collectScrapedImages(allPostData, postTypes)); } mergeImagesIntoPosts(images, posts); @@ -31,9 +43,9 @@ export async function parseFilePromise() { return posts; } -function getPostTypes(channelData) { +function getPostTypes(allPostData) { // search export file for all post types minus some specific types we don't want - const types = channelData + const types = allPostData .map(item => item.post_type[0]) .filter(type => ![ 'attachment', @@ -52,14 +64,14 @@ function getPostTypes(channelData) { return [...new Set(types)]; // remove duplicates } -function getItemsOfType(channelData, type) { - return channelData.filter(item => item.post_type[0] === type); +function getItemsOfType(allPostData, type) { + return allPostData.filter(item => item.post_type[0] === type); } -function collectPosts(channelData, postTypes) { +function collectPosts(allPostData, postTypes) { let allPosts = []; postTypes.forEach(postType => { - const postsForType = getItemsOfType(channelData, postType) + const postsForType = getItemsOfType(allPostData, postType) .filter(postData => postData.status[0] !== 'trash') .filter(postData => !(postType === 'page' && postData.post_name[0] === 'sample-page')) .map(postData => buildPost(postData)); @@ -106,8 +118,8 @@ function getPostMetaValue(metas, key) { return meta ? 
meta.meta_value[0] : undefined; } -function collectAttachedImages(channelData) { - const images = getItemsOfType(channelData, 'attachment') +function collectAttachedImages(allPostData) { + const images = getItemsOfType(allPostData, 'attachment') // filter to certain image file types .filter(attachment => attachment.attachment_url && (/\.(gif|jpe?g|png|webp)$/i).test(attachment.attachment_url[0])) .map(attachment => ({ @@ -120,10 +132,10 @@ function collectAttachedImages(channelData) { return images; } -function collectScrapedImages(channelData, postTypes) { +function collectScrapedImages(allPostData, postTypes) { const images = []; postTypes.forEach(postType => { - getItemsOfType(channelData, postType).forEach(postData => { + getItemsOfType(allPostData, postType).forEach(postData => { const postId = postData.post_id[0]; const postContent = postData.encoded[0]; const postLink = postData.link[0]; diff --git a/src/shared.js b/src/shared.js index 38316af..1a2216d 100644 --- a/src/shared.js +++ b/src/shared.js @@ -7,6 +7,44 @@ export function camelCase(str) { return str.replace(/-(.)/g, (match) => match[1].toUpperCase()); } +export function getValue(obj, propName, index) { + if (obj === undefined) { + throw new Error(`Could not find undefined.${propName}.`) + } + + let expression = `${obj['wetm-expression'] ?? 
'object'}.${propName}`; + + const values = obj[propName]; + if (values === undefined) { + throw new Error(`Could not find ${expression}.`) + } + + if (index === undefined) { + values.forEach((value, index) => { + value['wetm-expression'] = `${expression}[${index}]`; + }); + return values; + } else { + expression += `[${index}]`; + + const value = values[index]; + if (value === undefined) { + throw new Error(`Could not find ${expression}.`) + } + + value['wetm-expression'] = expression; + return value; + } +} + +export function getOptionalValue(obj, propName, index) { + try { + return getValue(obj, propName, index); + } catch (ex) { + return undefined; + } +} + export function getSlugWithFallback(post) { return post.slug ? post.slug : 'id-' + post.id; } From 922515ec23261f10c26654f43a8802b8853a597b Mon Sep 17 00:00:00 2001 From: Will Boyd Date: Mon, 17 Feb 2025 16:24:24 -0500 Subject: [PATCH 2/9] Use shared.getValue() wherever relevant --- src/frontmatter.js | 60 +++++++++++++++++++--------------------------- src/parser.js | 48 ++++++++++++++++++++----------------- src/shared.js | 6 ++++- 3 files changed, 55 insertions(+), 59 deletions(-) diff --git a/src/frontmatter.js b/src/frontmatter.js index 3e34b9e..bae2b64 100644 --- a/src/frontmatter.js +++ b/src/frontmatter.js @@ -1,74 +1,62 @@ -// get author, without decoding -// WordPress doesn't allow funky characters in usernames anyway +import * as shared from './shared.js'; + export function author(post) { - return post.data.creator[0]; + // not decoded, WordPress doesn't allow funky characters in usernames anyway + return shared.getValue(post.data, 'creator', 0); } -// get array of decoded category names, excluding 'uncategorized' export function categories(post) { - if (!post.data.category) { - return []; - } - - const categories = post.data.category - .filter(category => category.$.domain === 'category') - .map(({ $: attributes }) => decodeURIComponent(attributes.nicename)); - - return categories.filter((category) 
=> category !== 'uncategorized'); + // array of decoded category names, excluding 'uncategorized' + const categories = shared.getOptionalValue(post.data, 'category') ?? []; + return categories + .filter((category) => category.$.domain === 'category' && category.$.nicename !== 'uncategorized') + .map((category) => decodeURIComponent(category.$.nicename)); } -// get cover image filename, previously decoded and set on post -// this one is unique as it relies on special logic executed by the parser export function coverImage(post) { + // cover image filename, previously parsed and decoded return post.coverImage; } -// get post date, previously saved as a luxon datetime object on post export function date(post) { + // a luxon datetime object, previously parsed return post.date; } -// get boolean indicating if post is a draft -// this will only be included if true, otherwise it's left off export function draft(post) { + // boolean representing the previously parsed draft status, only included when true return post.isDraft ? 
true : undefined; } -// get excerpt, not decoded, newlines collapsed export function excerpt(post) { - return post.data.encoded[1].replace(/[\r\n]+/gm, ' '); + // not decoded, newlines collapsed + return shared.getValue(post.data, 'encoded', 1).replace(/[\r\n]+/gm, ' '); } -// get ID, as an integer export function id(post) { + // previously parsed as a string, converted to integer here return parseInt(post.id); } -// get slug, previously decoded and set on post export function slug(post) { + // previously parsed and decoded return post.slug; } -// get array of decoded tag names export function tags(post) { - if (!post.data.category) { - return []; - } - - const categories = post.data.category - .filter(category => category.$.domain === 'post_tag') - .map(({ $: attributes }) => decodeURIComponent(attributes.nicename)); - - return categories; + // array of decoded tag names (yes, they come from nodes, not a typo) + const categories = shared.getOptionalValue(post.data, 'category') ?? []; + return categories + .filter((category) => category.$.domain === 'post_tag') + .map((category) => decodeURIComponent(category.$.nicename)); } -// get simple post title, but not decoded like other frontmatter string fields export function title(post) { - return post.data.title[0]; + // not decoded + return shared.getValue(post.data, 'title', 0); } -// get type, often this will always be "post" -// but can also be "page" or other custom types export function type(post) { + // previously parsed but not decoded, can be "post", "page", or other custom types return post.type; } diff --git a/src/parser.js b/src/parser.js index 053432c..daed8a6 100644 --- a/src/parser.js +++ b/src/parser.js @@ -21,7 +21,7 @@ export async function parseFilePromise() { if (rssData === undefined) { throw new Error('Could not find root node. 
This likely means your import file is malformed.') } - rssData['wetm-expression'] = 'rss'; + rssData['wetm-expression'] = 'rss'; const channelData = shared.getValue(rssData, 'channel', 0); const allPostData = shared.getValue(channelData, 'item'); @@ -46,7 +46,7 @@ export async function parseFilePromise() { function getPostTypes(allPostData) { // search export file for all post types minus some specific types we don't want const types = allPostData - .map(item => item.post_type[0]) + .map(item => shared.getValue(item, 'post_type', 0)) .filter(type => ![ 'attachment', 'revision', @@ -65,15 +65,15 @@ function getPostTypes(allPostData) { } function getItemsOfType(allPostData, type) { - return allPostData.filter(item => item.post_type[0] === type); + return allPostData.filter(item => shared.getValue(item, 'post_type', 0) === type); } function collectPosts(allPostData, postTypes) { let allPosts = []; postTypes.forEach(postType => { const postsForType = getItemsOfType(allPostData, postType) - .filter(postData => postData.status[0] !== 'trash') - .filter(postData => !(postType === 'page' && postData.post_name[0] === 'sample-page')) + .filter(postData => shared.getValue(postData, 'status', 0) !== 'trash') + .filter(postData => !(postType === 'page' && shared.getValue(postData, 'post_name', 0) === 'sample-page')) .map(postData => buildPost(postData)); if (postsForType.length > 0) { @@ -92,15 +92,15 @@ function buildPost(data) { data, // body content converted to markdown - content: translator.getPostContent(data.encoded[0]), + content: translator.getPostContent(shared.getValue(data, 'encoded', 0)), // particularly useful values for all sorts of things - type: data.post_type[0], - id: data.post_id[0], - isDraft: data.status[0] === 'draft', - slug: decodeURIComponent(data.post_name[0]), + type: shared.getValue(data, 'post_type', 0), + id: shared.getValue(data, 'post_id', 0), + isDraft: shared.getValue(data, 'status', 0) === 'draft', + slug: 
decodeURIComponent(shared.getValue(data, 'post_name', 0)), date: getPostDate(data), - coverImageId: getPostMetaValue(data.postmeta, '_thumbnail_id'), + coverImageId: getPostMetaValue(data, '_thumbnail_id'), // these are possibly set later in mergeImagesIntoPosts() coverImage: undefined, @@ -109,23 +109,27 @@ function buildPost(data) { } function getPostDate(data) { - const date = luxon.DateTime.fromRFC2822(data.pubDate[0] ?? '', { zone: shared.config.customDateTimezone }); + const date = luxon.DateTime.fromRFC2822(shared.getValue(data, 'pubDate', 0) ?? '', { zone: shared.config.customDateTimezone }); return date.isValid ? date : undefined; } -function getPostMetaValue(metas, key) { - const meta = metas && metas.find((meta) => meta.meta_key[0] === key); - return meta ? meta.meta_value[0] : undefined; +function getPostMetaValue(data, key) { + const metas = shared.getOptionalValue(data, 'postmeta'); + const meta = metas && metas.find((meta) => shared.getValue(meta, 'meta_key', 0) === key); + return meta ? 
shared.getValue(meta, 'meta_value', 0) : undefined; } function collectAttachedImages(allPostData) { const images = getItemsOfType(allPostData, 'attachment') // filter to certain image file types - .filter(attachment => attachment.attachment_url && (/\.(gif|jpe?g|png|webp)$/i).test(attachment.attachment_url[0])) + .filter(attachment => { + const url = shared.getOptionalValue(attachment, 'attachment_url', 0); + return url && (/\.(gif|jpe?g|png|webp)$/i).test(url); + }) .map(attachment => ({ - id: attachment.post_id[0], - postId: attachment.post_parent[0], - url: attachment.attachment_url[0] + id: shared.getValue(attachment, 'post_id', 0), + postId: shared.getValue(attachment, 'post_parent', 0), + url: shared.getValue(attachment, 'attachment_url', 0) })); console.log(images.length + ' attached images found.'); @@ -136,9 +140,9 @@ function collectScrapedImages(allPostData, postTypes) { const images = []; postTypes.forEach(postType => { getItemsOfType(allPostData, postType).forEach(postData => { - const postId = postData.post_id[0]; - const postContent = postData.encoded[0]; - const postLink = postData.link[0]; + const postId = shared.getValue(postData, 'post_id', 0); + const postContent = shared.getValue(postData, 'encoded', 0); + const postLink = shared.getValue(postData, 'link', 0); const matches = [...postContent.matchAll(/]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)]; matches.forEach(match => { diff --git a/src/shared.js b/src/shared.js index 1a2216d..e15cb02 100644 --- a/src/shared.js +++ b/src/shared.js @@ -22,6 +22,7 @@ export function getValue(obj, propName, index) { if (index === undefined) { values.forEach((value, index) => { value['wetm-expression'] = `${expression}[${index}]`; + // console.log('>>>', value['wetm-expression']); }); return values; } else { @@ -32,7 +33,10 @@ export function getValue(obj, propName, index) { throw new Error(`Could not find ${expression}.`) } - value['wetm-expression'] = expression; + if (typeof value === 'object') { + 
value['wetm-expression'] = expression; + // console.log('>>>', value['wetm-expression']); + } return value; } } From aaafd6bd07e9e473a49b821196ea77e6c2b28cbb Mon Sep 17 00:00:00 2001 From: Will Boyd Date: Fri, 21 Feb 2025 14:54:50 -0500 Subject: [PATCH 3/9] Data wrapper class --- src/data.js | 79 ++++++++++++++++++++++++++++++++++++++++++++++ src/frontmatter.js | 20 ++++++------ src/parser.js | 69 ++++++++++++++++------------------------ src/shared.js | 42 ------------------------ 4 files changed, 116 insertions(+), 94 deletions(-) create mode 100644 src/data.js diff --git a/src/data.js b/src/data.js new file mode 100644 index 0000000..44ce3f5 --- /dev/null +++ b/src/data.js @@ -0,0 +1,79 @@ +import xml2js from 'xml2js'; + +class Data { + #obj; + #expression; + + constructor(obj, expression) { + this.#obj = typeof obj === 'string' ? { _: obj } : obj; + this.#expression = expression; + } + + get value() { + const value = this.#obj._; + if (value === undefined) { + throw new Error(`Could not get value from ${this.#expression}.`); + } + + return value; + } + + #buildExpression(propName, index) { + let expression = `${this.#expression}.${propName}`; + if (index !== undefined) { + expression += `[${index}]`; + } + + return expression; + } + + #getPropArray(propName, isRequired) { + const propArray = this.#obj[propName]; + if (propArray === undefined && isRequired) { + throw new Error(`Could not find ${this.#buildExpression(propName)}.`); + } + + return propArray; + } + + getAll(propName, isRequired = true) { + const propArray = this.#getPropArray(propName, isRequired); + return propArray !== undefined ? propArray.map((value, index) => new Data(value, this.#buildExpression(propName, index))) : undefined; + } + + getSingle(propName, index, isRequired = true) { + const prop = (this.#getPropArray(propName, isRequired) ?? 
[])[index]; + + if (prop === undefined && isRequired) { + throw new Error(`Could not find ${this.#buildExpression(propName, index)}.`) + } + + return prop !== undefined ? new Data(prop, this.#buildExpression(propName, index)) : undefined; + } + + getAttribute(attrName) { + const attribute = this.#obj.$?.[attrName]; + if (attribute === undefined) { + throw new Error(`Could not get attribute ${attrName} from ${this.#expression}.`); + } + + return attribute; + } +} + +export async function load(content) { + const rootData = await xml2js.parseStringPromise(content, { + tagNameProcessors: [xml2js.processors.stripPrefix], + trim: true + }).catch((ex) => { + ex.message = 'Could not parse XML. This likely means your import file is malformed.\n\n' + ex.message; + throw ex; + }); + + const rssData = rootData.rss; + if (rssData === undefined) { + throw new Error('Could not find root node. This likely means your import file is malformed.') + } + + return new Data(rssData, 'rss'); +} diff --git a/src/frontmatter.js b/src/frontmatter.js index bae2b64..e600198 100644 --- a/src/frontmatter.js +++ b/src/frontmatter.js @@ -1,16 +1,14 @@ -import * as shared from './shared.js'; - export function author(post) { // not decoded, WordPress doesn't allow funky characters in usernames anyway - return shared.getValue(post.data, 'creator', 0); + return post.data.getSingle('creator', 0).value; } export function categories(post) { // array of decoded category names, excluding 'uncategorized' - const categories = shared.getOptionalValue(post.data, 'category') ?? []; + const categories = post.data.getAll('category', false) ?? 
[]; return categories - .filter((category) => category.$.domain === 'category' && category.$.nicename !== 'uncategorized') - .map((category) => decodeURIComponent(category.$.nicename)); + .filter((category) => category.getAttribute('domain') === 'category' && category.getAttribute('nicename') !== 'uncategorized') + .map((category) => decodeURIComponent(category.getAttribute('nicename'))); } export function coverImage(post) { @@ -30,7 +28,7 @@ export function draft(post) { export function excerpt(post) { // not decoded, newlines collapsed - return shared.getValue(post.data, 'encoded', 1).replace(/[\r\n]+/gm, ' '); + return post.data.getSingle('encoded', 1).value.replace(/[\r\n]+/gm, ' '); } export function id(post) { @@ -45,15 +43,15 @@ export function slug(post) { export function tags(post) { // array of decoded tag names (yes, they come from nodes, not a typo) - const categories = shared.getOptionalValue(post.data, 'category') ?? []; + const categories = post.data.getAll('category', false) ?? 
[]; return categories - .filter((category) => category.$.domain === 'post_tag') - .map((category) => decodeURIComponent(category.$.nicename)); + .filter((category) => category.getAttribute('domain') === 'post_tag') + .map((category) => decodeURIComponent(category.getAttribute('nicename'))); } export function title(post) { // not decoded - return shared.getValue(post.data, 'title', 0); + return post.data.getSingle('title', 0).value; } export function type(post) { diff --git a/src/parser.js b/src/parser.js index daed8a6..924cc2c 100644 --- a/src/parser.js +++ b/src/parser.js @@ -1,6 +1,6 @@ import fs from 'fs'; import * as luxon from 'luxon'; -import xml2js from 'xml2js'; +import * as data from './data.js'; import * as frontmatter from './frontmatter.js'; import * as shared from './shared.js'; import * as translator from './translator.js'; @@ -8,23 +8,10 @@ import * as translator from './translator.js'; export async function parseFilePromise() { console.log('\nParsing...'); const content = await fs.promises.readFile(shared.config.input, 'utf8'); + const rssData = await data.load(content); - const rootData = await xml2js.parseStringPromise(content, { - trim: true, - tagNameProcessors: [xml2js.processors.stripPrefix] - }).catch((ex) => { - ex.message = 'Could not parse XML. This likely means your import file is malformed.\n\n' + ex.message; - throw ex; - }); - - const rssData = rootData.rss; - if (rssData === undefined) { - throw new Error('Could not find root node. 
This likely means your import file is malformed.') - } - rssData['wetm-expression'] = 'rss'; - - const channelData = shared.getValue(rssData, 'channel', 0); - const allPostData = shared.getValue(channelData, 'item'); + const channelData = rssData.getSingle('channel', 0); + const allPostData = channelData.getAll('item'); const postTypes = getPostTypes(allPostData); const posts = collectPosts(allPostData, postTypes); @@ -45,9 +32,9 @@ export async function parseFilePromise() { function getPostTypes(allPostData) { // search export file for all post types minus some specific types we don't want - const types = allPostData - .map(item => shared.getValue(item, 'post_type', 0)) - .filter(type => ![ + const postTypes = allPostData + .map((postData) => postData.getSingle('post_type', 0).value) + .filter((postType) => ![ 'attachment', 'revision', 'nav_menu_item', @@ -60,20 +47,20 @@ function getPostTypes(allPostData) { 'wp_navigation', 'wp_template', 'wp_template_part' - ].includes(type)); - return [...new Set(types)]; // remove duplicates + ].includes(postType)); + return [...new Set(postTypes)]; // remove duplicates } function getItemsOfType(allPostData, type) { - return allPostData.filter(item => shared.getValue(item, 'post_type', 0) === type); + return allPostData.filter(item => item.getSingle('post_type', 0).value === type); } function collectPosts(allPostData, postTypes) { let allPosts = []; postTypes.forEach(postType => { const postsForType = getItemsOfType(allPostData, postType) - .filter(postData => shared.getValue(postData, 'status', 0) !== 'trash') - .filter(postData => !(postType === 'page' && shared.getValue(postData, 'post_name', 0) === 'sample-page')) + .filter(postData => postData.getSingle('status', 0).value !== 'trash') + .filter(postData => !(postType === 'page' && postData.getSingle('post_name', 0).value === 'sample-page')) .map(postData => buildPost(postData)); if (postsForType.length > 0) { @@ -92,13 +79,13 @@ function buildPost(data) { data, // body 
content converted to markdown - content: translator.getPostContent(shared.getValue(data, 'encoded', 0)), + content: translator.getPostContent(data.getSingle('encoded', 0).value), // particularly useful values for all sorts of things - type: shared.getValue(data, 'post_type', 0), - id: shared.getValue(data, 'post_id', 0), - isDraft: shared.getValue(data, 'status', 0) === 'draft', - slug: decodeURIComponent(shared.getValue(data, 'post_name', 0)), + type: data.getSingle('post_type', 0).value, + id: data.getSingle('post_id', 0).value, + isDraft: data.getSingle('status', 0).value === 'draft', + slug: decodeURIComponent(data.getSingle('post_name', 0).value), date: getPostDate(data), coverImageId: getPostMetaValue(data, '_thumbnail_id'), @@ -109,27 +96,27 @@ function buildPost(data) { } function getPostDate(data) { - const date = luxon.DateTime.fromRFC2822(shared.getValue(data, 'pubDate', 0) ?? '', { zone: shared.config.customDateTimezone }); + const date = luxon.DateTime.fromRFC2822(data.getSingle('pubDate', 0).value ?? '', { zone: shared.config.customDateTimezone }); return date.isValid ? date : undefined; } function getPostMetaValue(data, key) { - const metas = shared.getOptionalValue(data, 'postmeta'); - const meta = metas && metas.find((meta) => shared.getValue(meta, 'meta_key', 0) === key); - return meta ? shared.getValue(meta, 'meta_value', 0) : undefined; + const metas = data.getAll('postmeta', false) ?? []; + const meta = metas.find((meta) => meta.getSingle('meta_key', 0).value === key); + return meta ? 
meta.getSingle('meta_value', 0).value : undefined; } function collectAttachedImages(allPostData) { const images = getItemsOfType(allPostData, 'attachment') // filter to certain image file types .filter(attachment => { - const url = shared.getOptionalValue(attachment, 'attachment_url', 0); + const url = attachment.getSingle('attachment_url', 0).value; return url && (/\.(gif|jpe?g|png|webp)$/i).test(url); }) .map(attachment => ({ - id: shared.getValue(attachment, 'post_id', 0), - postId: shared.getValue(attachment, 'post_parent', 0), - url: shared.getValue(attachment, 'attachment_url', 0) + id: attachment.getSingle('post_id', 0).value, + postId: attachment.getSingle('post_parent', 0).value, + url: attachment.getSingle('attachment_url', 0).value })); console.log(images.length + ' attached images found.'); @@ -140,9 +127,9 @@ function collectScrapedImages(allPostData, postTypes) { const images = []; postTypes.forEach(postType => { getItemsOfType(allPostData, postType).forEach(postData => { - const postId = shared.getValue(postData, 'post_id', 0); - const postContent = shared.getValue(postData, 'encoded', 0); - const postLink = shared.getValue(postData, 'link', 0); + const postId = postData.getSingle('post_id', 0).value; + const postContent = postData.getSingle('encoded', 0).value; + const postLink = postData.getSingle('link', 0).value; const matches = [...postContent.matchAll(/]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)]; matches.forEach(match => { diff --git a/src/shared.js b/src/shared.js index e15cb02..38316af 100644 --- a/src/shared.js +++ b/src/shared.js @@ -7,48 +7,6 @@ export function camelCase(str) { return str.replace(/-(.)/g, (match) => match[1].toUpperCase()); } -export function getValue(obj, propName, index) { - if (obj === undefined) { - throw new Error(`Could not find undefined.${propName}.`) - } - - let expression = `${obj['wetm-expression'] ?? 
'object'}.${propName}`; - - const values = obj[propName]; - if (values === undefined) { - throw new Error(`Could not find ${expression}.`) - } - - if (index === undefined) { - values.forEach((value, index) => { - value['wetm-expression'] = `${expression}[${index}]`; - // console.log('>>>', value['wetm-expression']); - }); - return values; - } else { - expression += `[${index}]`; - - const value = values[index]; - if (value === undefined) { - throw new Error(`Could not find ${expression}.`) - } - - if (typeof value === 'object') { - value['wetm-expression'] = expression; - // console.log('>>>', value['wetm-expression']); - } - return value; - } -} - -export function getOptionalValue(obj, propName, index) { - try { - return getValue(obj, propName, index); - } catch (ex) { - return undefined; - } -} - export function getSlugWithFallback(post) { return post.slug ? post.slug : 'id-' + post.id; } From 841111f85061ff41e81e7cf0830671009469963a Mon Sep 17 00:00:00 2001 From: Will Boyd Date: Fri, 21 Feb 2025 18:54:19 -0500 Subject: [PATCH 4/9] Data refactoring and renaming --- src/data.js | 35 +++++++++++++++++------------------ src/frontmatter.js | 18 +++++++++--------- src/parser.js | 44 ++++++++++++++++++++++---------------------- 3 files changed, 48 insertions(+), 49 deletions(-) diff --git a/src/data.js b/src/data.js index 44ce3f5..2c1d4b6 100644 --- a/src/data.js +++ b/src/data.js @@ -27,31 +27,30 @@ class Data { return expression; } - #getPropArray(propName, isRequired) { - const propArray = this.#obj[propName]; - if (propArray === undefined && isRequired) { + children(propName) { + const nodes = this.#obj[propName] ?? 
[]; + return nodes.map((value, index) => new Data(value, this.#buildExpression(propName, index))); + } + + child(propName, index = 0) { + const nodes = this.#obj[propName]; + if (nodes === undefined) { throw new Error(`Could not find ${this.#buildExpression(propName)}.`); } - return propArray; - } - - getAll(propName, isRequired = true) { - const propArray = this.#getPropArray(propName, isRequired); - return propArray !== undefined ? propArray.map((value, index) => new Data(value, this.#buildExpression(propName, index))) : undefined; - } - - getSingle(propName, index, isRequired = true) { - const prop = (this.#getPropArray(propName, isRequired) ?? [])[index]; - - if (prop === undefined && isRequired) { - throw new Error(`Could not find ${this.#buildExpression(propName, index)}.`) + const node = nodes[index]; + if (node === undefined) { + throw new Error(`Could not find ${this.#buildExpression(propName, index)}.`); } - return prop !== undefined ? new Data(prop, this.#buildExpression(propName, index)) : undefined; + return new Data(node, this.#buildExpression(propName, index)); } - getAttribute(attrName) { + childValue(propName, index = 0) { + return this.child(propName, index).value; + } + + attribute(attrName) { const attribute = this.#obj.$?.[attrName]; if (attribute === undefined) { throw new Error(`Could not get attribute ${attrName} from ${this.#expression}.`); diff --git a/src/frontmatter.js b/src/frontmatter.js index e600198..74aede8 100644 --- a/src/frontmatter.js +++ b/src/frontmatter.js @@ -1,14 +1,14 @@ export function author(post) { // not decoded, WordPress doesn't allow funky characters in usernames anyway - return post.data.getSingle('creator', 0).value; + return post.data.childValue('creator'); } export function categories(post) { // array of decoded category names, excluding 'uncategorized' - const categories = post.data.getAll('category', false) ?? 
[]; + const categories = post.data.children('category'); return categories - .filter((category) => category.getAttribute('domain') === 'category' && category.getAttribute('nicename') !== 'uncategorized') - .map((category) => decodeURIComponent(category.getAttribute('nicename'))); + .filter((category) => category.attribute('domain') === 'category' && category.attribute('nicename') !== 'uncategorized') + .map((category) => decodeURIComponent(category.attribute('nicename'))); } export function coverImage(post) { @@ -28,7 +28,7 @@ export function draft(post) { export function excerpt(post) { // not decoded, newlines collapsed - return post.data.getSingle('encoded', 1).value.replace(/[\r\n]+/gm, ' '); + return post.data.childValue('encoded', 1).replace(/[\r\n]+/gm, ' '); } export function id(post) { @@ -43,15 +43,15 @@ export function slug(post) { export function tags(post) { // array of decoded tag names (yes, they come from nodes, not a typo) - const categories = post.data.getAll('category', false) ?? 
[]; + const categories = post.data.children('category'); return categories - .filter((category) => category.getAttribute('domain') === 'post_tag') - .map((category) => decodeURIComponent(category.getAttribute('nicename'))); + .filter((category) => category.attribute('domain') === 'post_tag') + .map((category) => decodeURIComponent(category.attribute('nicename'))); } export function title(post) { // not decoded - return post.data.getSingle('title', 0).value; + return post.data.childValue('title'); } export function type(post) { diff --git a/src/parser.js b/src/parser.js index 924cc2c..023b4da 100644 --- a/src/parser.js +++ b/src/parser.js @@ -10,8 +10,8 @@ export async function parseFilePromise() { const content = await fs.promises.readFile(shared.config.input, 'utf8'); const rssData = await data.load(content); - const channelData = rssData.getSingle('channel', 0); - const allPostData = channelData.getAll('item'); + const channelData = rssData.child('channel'); + const allPostData = channelData.children('item'); const postTypes = getPostTypes(allPostData); const posts = collectPosts(allPostData, postTypes); @@ -33,7 +33,7 @@ export async function parseFilePromise() { function getPostTypes(allPostData) { // search export file for all post types minus some specific types we don't want const postTypes = allPostData - .map((postData) => postData.getSingle('post_type', 0).value) + .map((postData) => postData.childValue('post_type')) .filter((postType) => ![ 'attachment', 'revision', @@ -52,15 +52,15 @@ function getPostTypes(allPostData) { } function getItemsOfType(allPostData, type) { - return allPostData.filter(item => item.getSingle('post_type', 0).value === type); + return allPostData.filter(item => item.childValue('post_type') === type); } function collectPosts(allPostData, postTypes) { let allPosts = []; postTypes.forEach(postType => { const postsForType = getItemsOfType(allPostData, postType) - .filter(postData => postData.getSingle('status', 0).value !== 'trash') 
- .filter(postData => !(postType === 'page' && postData.getSingle('post_name', 0).value === 'sample-page')) + .filter(postData => postData.childValue('status') !== 'trash') + .filter(postData => !(postType === 'page' && postData.childValue('post_name') === 'sample-page')) .map(postData => buildPost(postData)); if (postsForType.length > 0) { @@ -79,13 +79,13 @@ function buildPost(data) { data, // body content converted to markdown - content: translator.getPostContent(data.getSingle('encoded', 0).value), + content: translator.getPostContent(data.childValue('encoded')), // particularly useful values for all sorts of things - type: data.getSingle('post_type', 0).value, - id: data.getSingle('post_id', 0).value, - isDraft: data.getSingle('status', 0).value === 'draft', - slug: decodeURIComponent(data.getSingle('post_name', 0).value), + type: data.childValue('post_type'), + id: data.childValue('post_id'), + isDraft: data.childValue('status') === 'draft', + slug: decodeURIComponent(data.childValue('post_name')), date: getPostDate(data), coverImageId: getPostMetaValue(data, '_thumbnail_id'), @@ -96,27 +96,27 @@ function buildPost(data) { } function getPostDate(data) { - const date = luxon.DateTime.fromRFC2822(data.getSingle('pubDate', 0).value ?? '', { zone: shared.config.customDateTimezone }); + const date = luxon.DateTime.fromRFC2822(data.childValue('pubDate'), { zone: shared.config.customDateTimezone }); return date.isValid ? date : undefined; } function getPostMetaValue(data, key) { - const metas = data.getAll('postmeta', false) ?? []; - const meta = metas.find((meta) => meta.getSingle('meta_key', 0).value === key); - return meta ? meta.getSingle('meta_value', 0).value : undefined; + const metas = data.children('postmeta'); + const meta = metas.find((meta) => meta.childValue('meta_key') === key); + return meta ? 
meta.childValue('meta_value') : undefined; } function collectAttachedImages(allPostData) { const images = getItemsOfType(allPostData, 'attachment') // filter to certain image file types .filter(attachment => { - const url = attachment.getSingle('attachment_url', 0).value; + const url = attachment.childValue('attachment_url'); return url && (/\.(gif|jpe?g|png|webp)$/i).test(url); }) .map(attachment => ({ - id: attachment.getSingle('post_id', 0).value, - postId: attachment.getSingle('post_parent', 0).value, - url: attachment.getSingle('attachment_url', 0).value + id: attachment.childValue('post_id'), + postId: attachment.childValue('post_parent'), + url: attachment.childValue('attachment_url') })); console.log(images.length + ' attached images found.'); @@ -127,9 +127,9 @@ function collectScrapedImages(allPostData, postTypes) { const images = []; postTypes.forEach(postType => { getItemsOfType(allPostData, postType).forEach(postData => { - const postId = postData.getSingle('post_id', 0).value; - const postContent = postData.getSingle('encoded', 0).value; - const postLink = postData.getSingle('link', 0).value; + const postId = postData.childValue('post_id'); + const postContent = postData.childValue('encoded'); + const postLink = postData.childValue('link'); const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)]; matches.forEach(match => { From f0e8400ccd5e4de4a60d8b8c6b08ae227c68612b Mon Sep 17 00:00:00 2001 From: Will Boyd Date: Fri, 21 Feb 2025 18:54:41 -0500 Subject: [PATCH 5/9] Decrease default write delay --- src/questions.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/questions.js b/src/questions.js index 088f886..63d9b7a 100644 --- a/src/questions.js +++ b/src/questions.js @@ -119,7 +119,7 @@ export function load() { { name: 'markdown-file-write-delay', type: 'integer', default: 25 + default: 10 }, { name: 'include-time-with-date', From db5430117ef5214011e837625c2fc94430f72e83 Mon Sep 17 00:00:00
2001 From: Will Boyd Date: Sun, 23 Feb 2025 13:22:32 -0500 Subject: [PATCH 6/9] Data fixes, fix for <img> with 2+ src attributes --- src/data.js | 52 ++++++++++++++++++++++++++++++++++++---------- src/frontmatter.js | 9 +++++--- src/parser.js | 29 +++++++++++++++++++------- src/translator.js | 2 +- 4 files changed, 69 insertions(+), 23 deletions(-) diff --git a/src/data.js b/src/data.js index 2c1d4b6..55db05a 100644 --- a/src/data.js +++ b/src/data.js @@ -5,20 +5,15 @@ class Data { #expression; constructor(obj, expression) { + // xml2js returns leaf nodes as strings, turn those into consistent objects + // I found this to be safer and more efficient than using the explicitCharkey option this.#obj = typeof obj === 'string' ? { _: obj } : obj; + + // this identifies how the object was referenced, helps a ton with debugging this.#expression = expression; } - get value() { - const value = this.#obj._; - if (value === undefined) { - throw new Error(`Could not get value from ${this.#expression}.`); - } - - return value; - } - - #buildExpression(propName, index) { + #buildExpression(propName, index = undefined) { let expression = `${this.#expression}.${propName}`; if (index !== undefined) { expression += `[${index}]`; @@ -27,11 +22,22 @@ class Data { return expression; } + // used by "optional" functions to return undefined instead of throwing an error + #optional(func) { + try { + return func(); + } catch (ex) { + return undefined; + } + } + + // will not throw an error if property doesn't exist, defaults to empty array children(propName) { const nodes = this.#obj[propName] ??
[]; return nodes.map((value, index) => new Data(value, this.#buildExpression(propName, index))); } + // throws an error if property (or index on property) doesn't exist child(propName, index = 0) { const nodes = this.#obj[propName]; if (nodes === undefined) { @@ -46,10 +52,22 @@ class Data { return new Data(node, this.#buildExpression(propName, index)); } + // convenience function, since it's very common to want the value of a child childValue(propName, index = 0) { - return this.child(propName, index).value; + return this.child(propName, index).value(); + } + + // throws an error if this object doesn't have a value string + value() { + const value = this.#obj._; + if (value === undefined) { + throw new Error(`Could not get value from ${this.#expression}.`); + } + + return value; } + // throws an error if attribute does not exist attribute(attrName) { const attribute = this.#obj.$?.[attrName]; if (attribute === undefined) { @@ -58,6 +76,18 @@ class Data { return attribute; } + + optionalChild(propName, index = 0) { + return this.#optional(() => this.child(propName, index)); + } + + optionalChildValue(propName, index = 0) { + return this.#optional(() => this.childValue(propName, index)); + } + + optionalValue() { + return this.#optional(() => this.value()); + } } export async function load(content) { diff --git a/src/frontmatter.js b/src/frontmatter.js index 74aede8..22f37c9 100644 --- a/src/frontmatter.js +++ b/src/frontmatter.js @@ -1,6 +1,7 @@ export function author(post) { - // not decoded, WordPress doesn't allow funky characters in usernames anyway - return post.data.childValue('creator'); + // not decoded (WordPress doesn't allow funky characters in usernames anyway) + // surprisingly, does not always exist (squarespace exports, for example) + return post.data.optionalChildValue('creator'); } export function categories(post) { @@ -28,7 +29,9 @@ export function draft(post) { export function excerpt(post) { // not decoded, newlines collapsed - return 
post.data.childValue('encoded', 1).replace(/[\r\n]+/gm, ' '); + // does not always exist (squarespace exports, for example) + const encoded = post.data.optionalChildValue('encoded', 1); + return encoded ? encoded.replace(/[\r\n]+/gm, ' ') : undefined; } export function id(post) { diff --git a/src/parser.js b/src/parser.js index 023b4da..915c97b 100644 --- a/src/parser.js +++ b/src/parser.js @@ -115,7 +115,7 @@ function collectAttachedImages(allPostData) { }) .map(attachment => ({ id: attachment.childValue('post_id'), - postId: attachment.childValue('post_parent'), + postId: attachment.optionalChildValue('post_parent') ?? 'nope', // may not exist (cover image in a squarespace export, for example) url: attachment.childValue('attachment_url') })); @@ -128,16 +128,25 @@ function collectScrapedImages(allPostData, postTypes) { postTypes.forEach(postType => { getItemsOfType(allPostData, postType).forEach(postData => { const postId = postData.childValue('post_id'); + const postContent = postData.childValue('encoded'); - const postLink = postData.childValue('link'); + const scrapedUrls = [...postContent.matchAll(/<img[^>]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]); + scrapedUrls.forEach((scrapedUrl) => { + let url; + if (isAbsoluteUrl(scrapedUrl)) { + url = scrapedUrl; + } else { + const postLink = postData.childValue('link'); + if (isAbsoluteUrl(postLink)) { + url = new URL(scrapedUrl, postLink).href; + } else { + throw new Error(`Unable to determine absolute URL from scraped image URL '${scrapedUrl}' and post link URL '${postLink}'.`); + } + } - const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)]; - matches.forEach(match => { - // base the matched image URL relative to the post URL - const url = new URL(match[1], postLink).href; images.push({ - id: -1, - postId: postId, + id: 'nope', // scraped images don't have an id + postId, url }); }); @@ -187,3 +196,7 @@ function populateFrontmatter(posts) { }); }
+function isAbsoluteUrl(url) { + return (/^https?:\/\//i).test(url); +} + diff --git a/src/translator.js b/src/translator.js index a4f3bb1..a317f4b 100644 --- a/src/translator.js +++ b/src/translator.js @@ -107,7 +107,7 @@ export function getPostContent(content) { if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') { // writeImageFile() will save all content images to a relative /images // folder so update references in post content to match - content = content.replace(/(<img[^>]*src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3'); + content = content.replace(/(<img[^>]*?src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3'); } // preserve "more" separator, max one per post, optionally with custom label From 94c93d045c6d2ca01c6b0c458059ed0a148a7a47 Mon Sep 17 00:00:00 2001 From: Will Boyd Date: Sun, 23 Feb 2025 16:18:40 -0500 Subject: [PATCH 7/9] Fix for awful markdown links with <div>
--- src/translator.js | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/translator.js b/src/translator.js index a317f4b..2642106 100644 --- a/src/translator.js +++ b/src/translator.js @@ -34,6 +34,14 @@ function initTurndownService() { replacement: (content, node) => '\n\n' + node.outerHTML }); + // <div>
within <a> can cause extra whitespace that wreck markdown links, so this removes them + turndownService.addRule('a', { filter: 'a', replacement: (content) => { return content.replace(/<\/?div[^>]*>/gi, ''); } }); + // preserve embedded scripts (for tweets, codepens, gists, etc.) turndownService.addRule('script', { filter: 'script', From 5aef591c3d304392e09f9e38739c4005d5836c55 Mon Sep 17 00:00:00 2001 From: Will Boyd Date: Mon, 24 Feb 2025 16:53:43 -0500 Subject: [PATCH 8/9] Remove style contents, excessive line breaks --- src/translator.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/translator.js b/src/translator.js index 2642106..6c31b81 100644 --- a/src/translator.js +++ b/src/translator.js @@ -14,6 +14,8 @@ function initTurndownService() { turndownService.use(turndownPluginGfm.tables); + turndownService.remove(['style']); //