mirror of
https://github.com/10h30/wordpress-export-to-markdown.git
synced 2026-06-05 15:09:59 +09:00
Some console logs for stats
This commit is contained in:
+45
-41
@@ -13,6 +13,8 @@ async function parseFilePromise(config) {
|
||||
tagNameProcessors: [xml2js.processors.stripPrefix]
|
||||
});
|
||||
|
||||
let posts = collectPosts(data);
|
||||
|
||||
let images = [];
|
||||
if (config.saveattachedimages) {
|
||||
images.push(...collectAttachedImages(data));
|
||||
@@ -21,55 +23,20 @@ async function parseFilePromise(config) {
|
||||
images.push(...collectScrapedImages(data));
|
||||
}
|
||||
|
||||
let posts = collectPosts(data);
|
||||
mergeImagesIntoPosts(images, posts);
|
||||
|
||||
return Promise.resolve(posts);
|
||||
}
|
||||
|
||||
function collectAttachedImages(data) {
|
||||
let images = getItemsOfType(data, 'attachment')
|
||||
// filter to certain image file types
|
||||
.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
|
||||
.map(attachment => ({
|
||||
id: attachment.post_id[0],
|
||||
postId: attachment.post_parent[0],
|
||||
url: attachment.attachment_url[0]
|
||||
}));
|
||||
|
||||
return images;
|
||||
}
|
||||
|
||||
function collectScrapedImages(data) {
|
||||
let images = [];
|
||||
|
||||
getItemsOfType(data, 'post').forEach(post => {
|
||||
let postId = post.post_id[0];
|
||||
let postContent = post.encoded[0];
|
||||
let postLink = post.link[0];
|
||||
|
||||
let matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
|
||||
matches.forEach(match => {
|
||||
// base the matched image URL relative to the post URL
|
||||
let url = new URL(match[1], postLink).href;
|
||||
|
||||
images.push({
|
||||
id: -1,
|
||||
postId: postId,
|
||||
url: url
|
||||
});
|
||||
console.log('Scraped ' + url + '.');
|
||||
});
|
||||
});
|
||||
|
||||
return images;
|
||||
function getItemsOfType(data, type) {
|
||||
return data.rss.channel[0].item.filter(item => item.post_type[0] === type);
|
||||
}
|
||||
|
||||
function collectPosts(data) {
|
||||
// this is passed into getPostContent() for the markdown conversion
|
||||
turndownService = translator.initTurndownService();
|
||||
|
||||
return getItemsOfType(data, 'post')
|
||||
let posts = getItemsOfType(data, 'post')
|
||||
.map(post => ({
|
||||
// meta data isn't written to file, but is used to help with other things
|
||||
meta: {
|
||||
@@ -84,10 +51,9 @@ function collectPosts(data) {
|
||||
},
|
||||
content: translator.getPostContent(post, turndownService, config)
|
||||
}));
|
||||
}
|
||||
|
||||
function getItemsOfType(data, type) {
|
||||
return data.rss.channel[0].item.filter(item => item.post_type[0] === type);
|
||||
console.log(posts.length + ' posts found.');
|
||||
return posts;
|
||||
}
|
||||
|
||||
function getPostId(post) {
|
||||
@@ -113,6 +79,44 @@ function getPostDate(post) {
|
||||
return luxon.DateTime.fromRFC2822(post.pubDate[0], { zone: 'utc' }).toISODate();
|
||||
}
|
||||
|
||||
function collectAttachedImages(data) {
|
||||
let images = getItemsOfType(data, 'attachment')
|
||||
// filter to certain image file types
|
||||
.filter(attachment => (/\.(gif|jpe?g|png)$/i).test(attachment.attachment_url[0]))
|
||||
.map(attachment => ({
|
||||
id: attachment.post_id[0],
|
||||
postId: attachment.post_parent[0],
|
||||
url: attachment.attachment_url[0]
|
||||
}));
|
||||
|
||||
console.log(images.length + ' attached images found.');
|
||||
return images;
|
||||
}
|
||||
|
||||
function collectScrapedImages(data) {
|
||||
let images = [];
|
||||
getItemsOfType(data, 'post').forEach(post => {
|
||||
let postId = post.post_id[0];
|
||||
let postContent = post.encoded[0];
|
||||
let postLink = post.link[0];
|
||||
|
||||
let matches = [...postContent.matchAll(/<img[^>]*src="(.+?\.(?:gif|jpe?g|png))"[^>]*>/gi)];
|
||||
matches.forEach(match => {
|
||||
// base the matched image URL relative to the post URL
|
||||
let url = new URL(match[1], postLink).href;
|
||||
|
||||
images.push({
|
||||
id: -1,
|
||||
postId: postId,
|
||||
url: url
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
console.log(images.length + ' images scraped from post body content.');
|
||||
return images;
|
||||
}
|
||||
|
||||
function mergeImagesIntoPosts(images, posts) {
|
||||
// create lookup table for quicker traversal
|
||||
let postsLookup = posts.reduce((lookup, post) => {
|
||||
|
||||
+1
-4
@@ -1,7 +1,5 @@
|
||||
const turndown = require('turndown');
|
||||
|
||||
const shared = require('./shared');
|
||||
|
||||
function initTurndownService() {
|
||||
let turndownService = new turndown({
|
||||
headingStyle: 'atx',
|
||||
@@ -47,8 +45,7 @@ function initTurndownService() {
|
||||
turndownService.addRule('iframe', {
|
||||
filter: 'iframe',
|
||||
replacement: (content, node) => {
|
||||
let html = node.outerHTML
|
||||
.replace('allowfullscreen=""', 'allowfullscreen');
|
||||
let html = node.outerHTML.replace('allowfullscreen=""', 'allowfullscreen');
|
||||
return '\n\n' + html + '\n\n';
|
||||
}
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user