mirror of
https://github.com/10h30/wordpress-export-to-markdown.git
synced 2026-06-05 15:09:59 +09:00
Data fixes, fix for <img> with 2+ src attributes
This commit is contained in:
+41
-11
@@ -5,20 +5,15 @@ class Data {
|
||||
#expression;
|
||||
|
||||
constructor(obj, expression) {
|
||||
// xml2js returns leaf nodes as strings, turn those into consistent objects
|
||||
// I found this to be safer and more efficient than using the explicitCharkey option
|
||||
this.#obj = typeof obj === 'string' ? { _: obj } : obj;
|
||||
|
||||
// this identifies how the object was referenced, helps a ton with debugging
|
||||
this.#expression = expression;
|
||||
}
|
||||
|
||||
// Accessor form of the node's text: reads the `_` property of the wrapped object.
// Throws an Error that names this node's expression when `_` is undefined.
// NOTE(review): a regular method named value() also appears further down in this class;
// if both definitions are really present, the later one replaces this accessor. Confirm which is intended.
get value() {
|
||||
const value = this.#obj._;
|
||||
if (value === undefined) {
|
||||
throw new Error(`Could not get value from ${this.#expression}.`);
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
#buildExpression(propName, index) {
|
||||
#buildExpression(propName, index = undefined) {
|
||||
let expression = `${this.#expression}.${propName}`;
|
||||
if (index !== undefined) {
|
||||
expression += `[${index}]`;
|
||||
@@ -27,11 +22,22 @@ class Data {
|
||||
return expression;
|
||||
}
|
||||
|
||||
// used by "optional" functions to return undefined instead of throwing an error
|
||||
#optional(func) {
|
||||
try {
|
||||
return func();
|
||||
} catch (ex) {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
// will not throw an error if property doesn't exist, defaults to empty array
|
||||
children(propName) {
|
||||
const nodes = this.#obj[propName] ?? [];
|
||||
return nodes.map((value, index) => new Data(value, this.#buildExpression(propName, index)));
|
||||
}
|
||||
|
||||
// throws an error if property (or index on property) doesn't exist
|
||||
child(propName, index = 0) {
|
||||
const nodes = this.#obj[propName];
|
||||
if (nodes === undefined) {
|
||||
@@ -46,10 +52,22 @@ class Data {
|
||||
return new Data(node, this.#buildExpression(propName, index));
|
||||
}
|
||||
|
||||
// convenience function, since it's very common to want the value of a child
|
||||
childValue(propName, index = 0) {
|
||||
return this.child(propName, index).value;
|
||||
return this.child(propName, index).value();
|
||||
}
|
||||
|
||||
// throws an error if this object doesn't have a value string
|
||||
value() {
|
||||
const value = this.#obj._;
|
||||
if (value === undefined) {
|
||||
throw new Error(`Could not get value from ${this.#expression}.`);
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
// throws an error if attribute does not exist
|
||||
attribute(attrName) {
|
||||
const attribute = this.#obj.$?.[attrName];
|
||||
if (attribute === undefined) {
|
||||
@@ -58,6 +76,18 @@ class Data {
|
||||
|
||||
return attribute;
|
||||
}
|
||||
|
||||
optionalChild(propName, index = 0) {
|
||||
return this.#optional(() => this.child(propName, index));
|
||||
}
|
||||
|
||||
optionalChildValue(propName, index = 0) {
|
||||
return this.#optional(() => this.childValue(propName, index));
|
||||
}
|
||||
|
||||
optionalValue() {
|
||||
return this.#optional(() => this.value());
|
||||
}
|
||||
}
|
||||
|
||||
export async function load(content) {
|
||||
|
||||
+6
-3
@@ -1,6 +1,7 @@
|
||||
/**
 * Reads the post's author from the 'creator' child of the post data.
 *
 * @param {object} post - post whose `data` exposes optionalChildValue()
 * @returns the creator value, or undefined when the export has no creator
 */
export function author(post) {
	// not decoded (WordPress doesn't allow funky characters in usernames anyway)
	// surprisingly, does not always exist (squarespace exports, for example),
	// so this must be the optional lookup: the throwing one would abort on such exports
	return post.data.optionalChildValue('creator');
}
|
||||
|
||||
export function categories(post) {
|
||||
@@ -28,7 +29,9 @@ export function draft(post) {
|
||||
|
||||
/**
 * Reads the post excerpt from the second 'encoded' child (index 1) of the post data.
 *
 * @param {object} post - post whose `data` exposes optionalChildValue()
 * @returns the excerpt with each run of CR/LF characters collapsed to one space,
 *          or undefined when the entry is absent or empty
 */
export function excerpt(post) {
	// not decoded, newlines collapsed
	// does not always exist (squarespace exports, for example), so look it up
	// optionally instead of letting a missing entry throw
	const encoded = post.data.optionalChildValue('encoded', 1);
	return encoded ? encoded.replace(/[\r\n]+/gm, ' ') : undefined;
}
|
||||
|
||||
export function id(post) {
|
||||
|
||||
+21
-8
@@ -115,7 +115,7 @@ function collectAttachedImages(allPostData) {
|
||||
})
|
||||
.map(attachment => ({
|
||||
id: attachment.childValue('post_id'),
|
||||
postId: attachment.childValue('post_parent'),
|
||||
postId: attachment.optionalChildValue('post_parent') ?? 'nope', // may not exist (cover image in a squarespace export, for example)
|
||||
url: attachment.childValue('attachment_url')
|
||||
}));
|
||||
|
||||
@@ -128,16 +128,25 @@ function collectScrapedImages(allPostData, postTypes) {
|
||||
postTypes.forEach(postType => {
|
||||
getItemsOfType(allPostData, postType).forEach(postData => {
|
||||
const postId = postData.childValue('post_id');
|
||||
|
||||
const postContent = postData.childValue('encoded');
|
||||
const postLink = postData.childValue('link');
|
||||
const scrapedUrls = [...postContent.matchAll(/<img\s[^>]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]);
|
||||
scrapedUrls.forEach((scrapedUrl) => {
|
||||
let url;
|
||||
if (isAbsoluteUrl(scrapedUrl)) {
|
||||
url = scrapedUrl;
|
||||
} else {
|
||||
const postLink = postData.childValue('link');
|
||||
if (isAbsoluteUrl(postLink)) {
|
||||
url = new URL(scrapedUrl, postLink).href;
|
||||
} else {
|
||||
throw new Error(`Unable to determine absolute URL from scraped image URL '${scrapedUrl}' and post link URL '${postLink}'.`);
|
||||
}
|
||||
}
|
||||
|
||||
const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)];
|
||||
matches.forEach(match => {
|
||||
// base the matched image URL relative to the post URL
|
||||
const url = new URL(match[1], postLink).href;
|
||||
images.push({
|
||||
id: -1,
|
||||
postId: postId,
|
||||
id: 'nope', // scraped images don't have an id
|
||||
postId,
|
||||
url
|
||||
});
|
||||
});
|
||||
@@ -187,3 +196,7 @@ function populateFrontmatter(posts) {
|
||||
});
|
||||
}
|
||||
|
||||
// True when url starts with an http:// or https:// scheme, matched case-insensitively.
function isAbsoluteUrl(url) {
	const httpScheme = /^https?:\/\//i;
	return httpScheme.test(url);
}
|
||||
|
||||
|
||||
+1
-1
@@ -107,7 +107,7 @@ export function getPostContent(content) {
|
||||
if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') {
|
||||
// writeImageFile() will save all content images to a relative /images
|
||||
// folder so update references in post content to match
|
||||
content = content.replace(/(<img[^>]*src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
|
||||
content = content.replace(/(<img\s[^>]*?src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
|
||||
}
|
||||
|
||||
// preserve "more" separator, max one per post, optionally with custom label
|
||||
|
||||
Reference in New Issue
Block a user