mirror of
https://github.com/10h30/wordpress-export-to-markdown.git
synced 2026-06-05 15:09:59 +09:00
Merge pull request #149 from lonekorean/parse-debugging
Parse debugging
This commit is contained in:
+108
@@ -0,0 +1,108 @@
|
||||
import xml2js from 'xml2js';
|
||||
|
||||
// Read-only wrapper around one node of the object tree that xml2js produces.
// Child nodes are looked up by property name, the node's text lives under "_" and its
// attributes under "$". Every wrapper remembers how it was reached so that errors can
// say exactly which part of the document was missing.
class Data {
	// the wrapped node (leaf strings are boxed as { _: string } by the constructor)
	#obj;

	// how this node was referenced from the root, only used to build error messages
	#expression;

	constructor(obj, expression) {
		// xml2js returns leaf nodes as strings, turn those into consistent objects
		// I found this to be safer and more efficient than using the explicitCharkey option
		this.#obj = typeof obj === 'string' ? { _: obj } : obj;

		// this identifies how the object was referenced, helps a ton with debugging
		this.#expression = expression;
	}

	// builds the expression for a property of this node, optionally narrowed to one index
	#buildExpression(propName, index = undefined) {
		const expression = `${this.#expression}.${propName}`;
		return index === undefined ? expression : `${expression}[${index}]`;
	}

	// returns the array of raw child nodes stored under propName, or undefined if there is none
	// only own array properties count, so inherited members (such as "constructor") and the
	// "_" and "$" keys are never mistaken for child nodes
	#findNodes(propName) {
		if (!Object.prototype.hasOwnProperty.call(this.#obj, propName)) {
			return undefined;
		}

		const nodes = this.#obj[propName];
		return Array.isArray(nodes) ? nodes : undefined;
	}

	// non-throwing lookup of a single wrapped child, shared by the "optional" functions
	#findChild(propName, index) {
		const node = this.#findNodes(propName)?.[index];
		return node === undefined ? undefined : new Data(node, this.#buildExpression(propName, index));
	}

	// will not throw an error if property doesn't exist, defaults to empty array
	children(propName) {
		const nodes = this.#findNodes(propName) ?? [];
		return nodes.map((value, index) => new Data(value, this.#buildExpression(propName, index)));
	}

	// throws an error if property (or index on property) doesn't exist
	child(propName, index = 0) {
		const nodes = this.#findNodes(propName);
		if (nodes === undefined) {
			throw new Error(`Could not find ${this.#buildExpression(propName)}.`);
		}

		const node = nodes[index];
		if (node === undefined) {
			throw new Error(`Could not find ${this.#buildExpression(propName, index)}.`);
		}

		return new Data(node, this.#buildExpression(propName, index));
	}

	// convenience function, since it's very common to want the value of a child
	childValue(propName, index = 0) {
		return this.child(propName, index).value();
	}

	// throws an error if this object doesn't have a value string
	value() {
		const value = this.#obj._;
		if (value === undefined) {
			throw new Error(`Could not get value from ${this.#expression}.`);
		}

		return value;
	}

	// throws an error if attribute does not exist
	attribute(attrName) {
		const attribute = this.#obj.$?.[attrName];
		if (attribute === undefined) {
			throw new Error(`Could not get attribute ${attrName} from ${this.#expression}.`);
		}

		return attribute;
	}

	// like child(), but returns undefined instead of throwing an error
	optionalChild(propName, index = 0) {
		return this.#findChild(propName, index);
	}

	// like childValue(), but returns undefined if either the child or its value is missing
	optionalChildValue(propName, index = 0) {
		return this.#findChild(propName, index)?.optionalValue();
	}

	// like value(), but returns undefined instead of throwing an error
	optionalValue() {
		return this.#obj._;
	}
}
|
||||
|
||||
// Parses the XML content of an export file and returns its <rss> root node wrapped in a Data object.
// Rejects with an error if the content can't be parsed as XML or has no <rss> root node.
export async function load(content) {
	let rootData;
	try {
		rootData = await xml2js.parseStringPromise(content, {
			tagNameProcessors: [xml2js.processors.stripPrefix],
			trim: true
		});
	} catch (ex) {
		// keep the original error (and its stack), just lead with a friendlier explanation
		ex.message = 'Could not parse XML. This likely means your import file is malformed.\n\n' + ex.message;
		throw ex;
	}

	// rootData is null rather than an object when there was nothing to parse
	// (NOTE(review): xml2js appears to do this for empty or whitespace-only content), so
	// guard the lookup to report the friendly error below instead of a raw TypeError
	const rssData = rootData?.rss;
	if (rssData === undefined) {
		throw new Error('Could not find <rss> root node. This likely means your import file is malformed.');
	}

	return new Data(rssData, 'rss');
}
|
||||
+25
-36
@@ -1,74 +1,63 @@
|
||||
// get author, without decoding
|
||||
// WordPress doesn't allow funky characters in usernames anyway
|
||||
export function author(post) {
|
||||
return post.data.creator[0];
|
||||
// not decoded (WordPress doesn't allow funky characters in usernames anyway)
|
||||
// surprisingly, does not always exist (squarespace exports, for example)
|
||||
return post.data.optionalChildValue('creator');
|
||||
}
|
||||
|
||||
// get array of decoded category names, excluding 'uncategorized'
|
||||
export function categories(post) {
|
||||
if (!post.data.category) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const categories = post.data.category
|
||||
.filter(category => category.$.domain === 'category')
|
||||
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
|
||||
|
||||
return categories.filter((category) => category !== 'uncategorized');
|
||||
// array of decoded category names, excluding 'uncategorized'
|
||||
const categories = post.data.children('category');
|
||||
return categories
|
||||
.filter((category) => category.attribute('domain') === 'category' && category.attribute('nicename') !== 'uncategorized')
|
||||
.map((category) => decodeURIComponent(category.attribute('nicename')));
|
||||
}
|
||||
|
||||
// get cover image filename, previously decoded and set on post
|
||||
// this one is unique as it relies on special logic executed by the parser
|
||||
// frontmatter getter for the cover image
export function coverImage(post) {
	// the filename was already parsed and decoded, so it is simply read off the post
	const { coverImage: filename } = post;
	return filename;
}
|
||||
|
||||
// get post date, previously saved as a luxon datetime object on post
|
||||
// frontmatter getter for the post date
export function date(post) {
	// already parsed into a luxon datetime object, passed through untouched
	const { date: parsedDate } = post;
	return parsedDate;
}
|
||||
|
||||
// get boolean indicating if post is a draft
|
||||
// this will only be included if true, otherwise it's left off
|
||||
// frontmatter getter for the draft flag
export function draft(post) {
	// only a true value is ever reported, undefined otherwise
	if (post.isDraft) {
		return true;
	}

	return undefined;
}
|
||||
|
||||
// get excerpt, not decoded, newlines collapsed
|
||||
export function excerpt(post) {
|
||||
return post.data.encoded[1].replace(/[\r\n]+/gm, ' ');
|
||||
// not decoded, newlines collapsed
|
||||
// does not always exist (squarespace exports, for example)
|
||||
const encoded = post.data.optionalChildValue('encoded', 1);
|
||||
return encoded ? encoded.replace(/[\r\n]+/gm, ' ') : undefined;
|
||||
}
|
||||
|
||||
// get ID, as an integer
|
||||
// frontmatter getter for the post ID, as an integer (NaN if the ID is not numeric)
export function id(post) {
	// previously parsed as a string, converted to integer here
	// the explicit radix keeps the ID decimal, so a "0x" prefix is not read as hexadecimal
	return Number.parseInt(post.id, 10);
}
|
||||
|
||||
// get slug, previously decoded and set on post
|
||||
// frontmatter getter for the slug
export function slug(post) {
	// already parsed and decoded, nothing left to do but hand it over
	const { slug: decodedSlug } = post;
	return decodedSlug;
}
|
||||
|
||||
// get array of decoded tag names
|
||||
export function tags(post) {
|
||||
if (!post.data.category) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const categories = post.data.category
|
||||
.filter(category => category.$.domain === 'post_tag')
|
||||
.map(({ $: attributes }) => decodeURIComponent(attributes.nicename));
|
||||
|
||||
return categories;
|
||||
// array of decoded tag names (yes, they come from <category> nodes, not a typo)
|
||||
const categories = post.data.children('category');
|
||||
return categories
|
||||
.filter((category) => category.attribute('domain') === 'post_tag')
|
||||
.map((category) => decodeURIComponent(category.attribute('nicename')));
|
||||
}
|
||||
|
||||
// get simple post title, but not decoded like other frontmatter string fields
|
||||
export function title(post) {
|
||||
return post.data.title[0];
|
||||
// not decoded
|
||||
return post.data.childValue('title');
|
||||
}
|
||||
|
||||
// get type, often this will always be "post"
|
||||
// but can also be "page" or other custom types
|
||||
// frontmatter getter for the post type
export function type(post) {
	// already parsed (but not decoded), may be "post", "page", or some other custom type
	const { type: postType } = post;
	return postType;
}
|
||||
|
||||
+63
-49
@@ -1,6 +1,6 @@
|
||||
import fs from 'fs';
|
||||
import * as luxon from 'luxon';
|
||||
import xml2js from 'xml2js';
|
||||
import * as data from './data.js';
|
||||
import * as frontmatter from './frontmatter.js';
|
||||
import * as shared from './shared.js';
|
||||
import * as translator from './translator.js';
|
||||
@@ -8,21 +8,18 @@ import * as translator from './translator.js';
|
||||
export async function parseFilePromise() {
|
||||
console.log('\nParsing...');
|
||||
const content = await fs.promises.readFile(shared.config.input, 'utf8');
|
||||
const allData = await xml2js.parseStringPromise(content, {
|
||||
trim: true,
|
||||
tagNameProcessors: [xml2js.processors.stripPrefix]
|
||||
});
|
||||
const channelData = allData.rss.channel[0].item;
|
||||
const rssData = await data.load(content);
|
||||
const allPostData = rssData.child('channel').children('item');
|
||||
|
||||
const postTypes = getPostTypes(channelData);
|
||||
const posts = collectPosts(channelData, postTypes);
|
||||
const postTypes = getPostTypes(allPostData);
|
||||
const posts = collectPosts(allPostData, postTypes);
|
||||
|
||||
const images = [];
|
||||
if (shared.config.saveImages === 'attached' || shared.config.saveImages === 'all') {
|
||||
images.push(...collectAttachedImages(channelData));
|
||||
images.push(...collectAttachedImages(allPostData));
|
||||
}
|
||||
if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') {
|
||||
images.push(...collectScrapedImages(channelData, postTypes));
|
||||
images.push(...collectScrapedImages(allPostData, postTypes));
|
||||
}
|
||||
|
||||
mergeImagesIntoPosts(images, posts);
|
||||
@@ -31,11 +28,11 @@ export async function parseFilePromise() {
|
||||
return posts;
|
||||
}
|
||||
|
||||
function getPostTypes(channelData) {
|
||||
function getPostTypes(allPostData) {
|
||||
// search export file for all post types minus some specific types we don't want
|
||||
const types = channelData
|
||||
.map(item => item.post_type[0])
|
||||
.filter(type => ![
|
||||
const postTypes = allPostData
|
||||
.map((postData) => postData.childValue('post_type'))
|
||||
.filter((postType) => ![
|
||||
'attachment',
|
||||
'revision',
|
||||
'nav_menu_item',
|
||||
@@ -48,20 +45,20 @@ function getPostTypes(channelData) {
|
||||
'wp_navigation',
|
||||
'wp_template',
|
||||
'wp_template_part'
|
||||
].includes(type));
|
||||
return [...new Set(types)]; // remove duplicates
|
||||
].includes(postType));
|
||||
return [...new Set(postTypes)]; // remove duplicates
|
||||
}
|
||||
|
||||
function getItemsOfType(channelData, type) {
|
||||
return channelData.filter(item => item.post_type[0] === type);
|
||||
function getItemsOfType(allPostData, type) {
|
||||
return allPostData.filter(item => item.childValue('post_type') === type);
|
||||
}
|
||||
|
||||
function collectPosts(channelData, postTypes) {
|
||||
function collectPosts(allPostData, postTypes) {
|
||||
let allPosts = [];
|
||||
postTypes.forEach(postType => {
|
||||
const postsForType = getItemsOfType(channelData, postType)
|
||||
.filter(postData => postData.status[0] !== 'trash')
|
||||
.filter(postData => !(postType === 'page' && postData.post_name[0] === 'sample-page'))
|
||||
const postsForType = getItemsOfType(allPostData, postType)
|
||||
.filter(postData => postData.childValue('status') !== 'trash')
|
||||
.filter(postData => !(postType === 'page' && postData.childValue('post_name') === 'sample-page'))
|
||||
.map(postData => buildPost(postData));
|
||||
|
||||
if (postsForType.length > 0) {
|
||||
@@ -80,15 +77,15 @@ function buildPost(data) {
|
||||
data,
|
||||
|
||||
// body content converted to markdown
|
||||
content: translator.getPostContent(data.encoded[0]),
|
||||
content: translator.getPostContent(data.childValue('encoded')),
|
||||
|
||||
// particularly useful values for all sorts of things
|
||||
type: data.post_type[0],
|
||||
id: data.post_id[0],
|
||||
isDraft: data.status[0] === 'draft',
|
||||
slug: decodeURIComponent(data.post_name[0]),
|
||||
type: data.childValue('post_type'),
|
||||
id: data.childValue('post_id'),
|
||||
isDraft: data.childValue('status') === 'draft',
|
||||
slug: decodeURIComponent(data.childValue('post_name')),
|
||||
date: getPostDate(data),
|
||||
coverImageId: getPostMetaValue(data.postmeta, '_thumbnail_id'),
|
||||
coverImageId: getPostMetaValue(data, '_thumbnail_id'),
|
||||
|
||||
// these are possibly set later in mergeImagesIntoPosts()
|
||||
coverImage: undefined,
|
||||
@@ -97,44 +94,57 @@ function buildPost(data) {
|
||||
}
|
||||
|
||||
function getPostDate(data) {
|
||||
const date = luxon.DateTime.fromRFC2822(data.pubDate[0] ?? '', { zone: shared.config.customDateTimezone });
|
||||
const date = luxon.DateTime.fromRFC2822(data.childValue('pubDate'), { zone: shared.config.customDateTimezone });
|
||||
return date.isValid ? date : undefined;
|
||||
}
|
||||
|
||||
function getPostMetaValue(metas, key) {
|
||||
const meta = metas && metas.find((meta) => meta.meta_key[0] === key);
|
||||
return meta ? meta.meta_value[0] : undefined;
|
||||
function getPostMetaValue(data, key) {
|
||||
const metas = data.children('postmeta');
|
||||
const meta = metas.find((meta) => meta.childValue('meta_key') === key);
|
||||
return meta ? meta.childValue('meta_value') : undefined;
|
||||
}
|
||||
|
||||
function collectAttachedImages(channelData) {
|
||||
const images = getItemsOfType(channelData, 'attachment')
|
||||
function collectAttachedImages(allPostData) {
|
||||
const images = getItemsOfType(allPostData, 'attachment')
|
||||
// filter to certain image file types
|
||||
.filter(attachment => attachment.attachment_url && (/\.(gif|jpe?g|png|webp)$/i).test(attachment.attachment_url[0]))
|
||||
.filter(attachment => {
|
||||
const url = attachment.childValue('attachment_url');
|
||||
return url && (/\.(gif|jpe?g|png|webp)$/i).test(url);
|
||||
})
|
||||
.map(attachment => ({
|
||||
id: attachment.post_id[0],
|
||||
postId: attachment.post_parent[0],
|
||||
url: attachment.attachment_url[0]
|
||||
id: attachment.childValue('post_id'),
|
||||
postId: attachment.optionalChildValue('post_parent') ?? 'nope', // may not exist (cover image in a squarespace export, for example)
|
||||
url: attachment.childValue('attachment_url')
|
||||
}));
|
||||
|
||||
console.log(images.length + ' attached images found.');
|
||||
return images;
|
||||
}
|
||||
|
||||
function collectScrapedImages(channelData, postTypes) {
|
||||
function collectScrapedImages(allPostData, postTypes) {
|
||||
const images = [];
|
||||
postTypes.forEach(postType => {
|
||||
getItemsOfType(channelData, postType).forEach(postData => {
|
||||
const postId = postData.post_id[0];
|
||||
const postContent = postData.encoded[0];
|
||||
const postLink = postData.link[0];
|
||||
getItemsOfType(allPostData, postType).forEach(postData => {
|
||||
const postId = postData.childValue('post_id');
|
||||
|
||||
const postContent = postData.childValue('encoded');
|
||||
const scrapedUrls = [...postContent.matchAll(/<img\s[^>]*?src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)].map((match) => match[1]);
|
||||
scrapedUrls.forEach((scrapedUrl) => {
|
||||
let url;
|
||||
if (isAbsoluteUrl(scrapedUrl)) {
|
||||
url = scrapedUrl;
|
||||
} else {
|
||||
const postLink = postData.childValue('link');
|
||||
if (isAbsoluteUrl(postLink)) {
|
||||
url = new URL(scrapedUrl, postLink).href;
|
||||
} else {
|
||||
throw new Error(`Unable to determine absolute URL from scraped image URL '${scrapedUrl}' and post link URL '${postLink}'.`);
|
||||
}
|
||||
}
|
||||
|
||||
const matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png|webp))"[^>]*>/gi)];
|
||||
matches.forEach(match => {
|
||||
// base the matched image URL relative to the post URL
|
||||
const url = new URL(match[1], postLink).href;
|
||||
images.push({
|
||||
id: -1,
|
||||
postId: postId,
|
||||
id: 'nope', // scraped images don't have an id
|
||||
postId,
|
||||
url
|
||||
});
|
||||
});
|
||||
@@ -184,3 +194,7 @@ function populateFrontmatter(posts) {
|
||||
});
|
||||
}
|
||||
|
||||
// returns true when url starts with an http:// or https:// scheme (in any letter case)
function isAbsoluteUrl(url) {
	const absolutePattern = /^https?:\/\//i;
	return absolutePattern.test(url);
}
|
||||
|
||||
|
||||
+1
-1
@@ -119,7 +119,7 @@ export function load() {
|
||||
{
|
||||
name: 'markdown-file-write-delay',
|
||||
type: 'integer',
|
||||
default: 25
|
||||
default: 10
|
||||
},
|
||||
{
|
||||
name: 'include-time-with-date',
|
||||
|
||||
+14
-1
@@ -14,6 +14,8 @@ function initTurndownService() {
|
||||
|
||||
turndownService.use(turndownPluginGfm.tables);
|
||||
|
||||
turndownService.remove(['style']); // <style> contents get dumped as plain text, would rather remove
|
||||
|
||||
// preserve embedded tweets
|
||||
turndownService.addRule('tweet', {
|
||||
filter: node => node.nodeName === 'BLOCKQUOTE' && node.getAttribute('class') === 'twitter-tweet',
|
||||
@@ -34,6 +36,14 @@ function initTurndownService() {
|
||||
replacement: (content, node) => '\n\n' + node.outerHTML
|
||||
});
|
||||
|
||||
// <div> within <a> can cause extra whitespace that wreck markdown links, so this removes them
|
||||
turndownService.addRule('a', {
|
||||
filter: 'a',
|
||||
replacement: (content) => {
|
||||
return content.replace(/<\/?div[^>]*>/gi, '');
|
||||
}
|
||||
});
|
||||
|
||||
// preserve embedded scripts (for tweets, codepens, gists, etc.)
|
||||
turndownService.addRule('script', {
|
||||
filter: 'script',
|
||||
@@ -107,7 +117,7 @@ export function getPostContent(content) {
|
||||
if (shared.config.saveImages === 'scraped' || shared.config.saveImages === 'all') {
|
||||
// writeImageFile() will save all content images to a relative /images
|
||||
// folder so update references in post content to match
|
||||
content = content.replace(/(<img[^>]*src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
|
||||
content = content.replace(/(<img\s[^>]*?src=").*?([^/"]+\.(?:gif|jpe?g|png|webp))("[^>]*>)/gi, '$1images/$2$3');
|
||||
}
|
||||
|
||||
// preserve "more" separator, max one per post, optionally with custom label
|
||||
@@ -124,5 +134,8 @@ export function getPostContent(content) {
|
||||
// clean up extra spaces in list items
|
||||
content = content.replace(/(-|\d+\.) +/g, '$1 ');
|
||||
|
||||
// collapse excessive newlines (can happen with a lot of <div>)
|
||||
content = content.replace(/(\r?\n){3,}/g, '\n\n');
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user