const larkTokenFetcher = require('./larkTokenFetcher.js')
const Downloader = require('./larkImageDownloader.js')
const slugify = require('slugify')
const fs = require('node:fs')
const { URL } = require('node:url')
const fetch = require('node-fetch')
const node_path = require('node:path')
const cheerio = require('cheerio')
const showdown = require('showdown')
const _ = require('lodash')
// MDX compilation will be loaded dynamically as it's an ES module
const IMAGE_BED_URL = process.env.IMAGE_BED_URL || 'https://zdoc-images.s3.us-west-2.amazonaws.com'
class larkDocWriter {
constructor(
root_token,
base_token,
displayedSidebar,
docSourceDir='plugins/lark-docs/meta/sources',
imageDir='static/img',
targets='zilliz.saas',
skip_image_download=false,
upload_to_s3=false
) {
this.root_token = root_token
this.base_token = base_token
this.displayedSidebar = displayedSidebar
this.docSourceDir = docSourceDir
this.page_blocks = []
this.blocks = []
this.targets = targets
this.skip_image_download = skip_image_download
this.imageDir = imageDir
this.iframes = []
this.block_types = this.__block_types()
this.code_langs = this.__code_langs()
this.tokenFetcher = new larkTokenFetcher()
this.downloader = new Downloader({}, imageDir)
this.upload_to_s3 = upload_to_s3
}
__fetch_doc_source (type, value, slug="") {
const file = fs.readdirSync(this.docSourceDir).filter(file => {
const page = JSON.parse(fs.readFileSync(`${this.docSourceDir}/${file}`, {encoding: 'utf-8', flag: 'r'}))
try {
type = type instanceof Array ? type.filter(t => Object.keys(page).includes(t))[0] : type
} catch (error) {
throw new Error(`1. Cannot find ${type} in ${this.docSourceDir}/${file}`)
}
return page[type] === value
})
if (file.length > 0) {
if (slug) {
return file.map(file => {
return JSON.parse(fs.readFileSync(`${this.docSourceDir}/${file}`, {encoding: 'utf-8', flag: 'r'}))
}).filter(page => page.slug === slug)[0]
} else {
return JSON.parse(fs.readFileSync(`${this.docSourceDir}/${file[0]}`, {encoding: 'utf-8', flag: 'r'}))
}
} else {
throw new Error(`2. Cannot find ${value} in ${this.docSourceDir}`)
}
}
async write_docs(path, token) {
const forEachAsync = async (array, callback) => {
for (let index = 0; index < array.length; index++) {
await callback(array[index], index, array);
}
}
var current_path = path
const node = this.__fetch_doc_source('node_token', token)
if (node.has_child) {
const children = node.children.filter(child => child.obj_type != 'bitable' && child != undefined)
await forEachAsync(children, async (child, index) => {
if (child.has_child) {
const meta = await this.__is_to_publish(child.title, child.slug)
if (meta['publish']) {
const token = child.node_token
const type = child.node_type
const slug = child.slug
const beta = meta['beta']
const notebook = meta['notebook']
const addedSince = meta['addSince']
const lastModified = meta['lastModified']
const deprecateSince = meta['deprecateSince']
const labels = meta['labels']
const keywords = meta['keywords']
console.log(`${current_path}/${slug}/${slug}.md`)
if (!fs.existsSync(`${current_path}/${slug}`)) {
fs.mkdirSync(`${current_path}/${slug}`)
}
await this.write_doc({
path: `${current_path}/${slug}`,
page_title: child.title,
page_slug: slug,
page_beta: beta,
notebook: notebook,
addedSince: addedSince,
lastModified: lastModified,
deprecateSince: deprecateSince,
page_type: type,
page_token: child.node_token,
sidebar_position: index+1,
sidebar_label: labels,
keywords: keywords,
doc_card_list: true,
})
await this.write_docs(`${current_path}/${slug}`, token)
}
} else {
const meta = await this.__is_to_publish(child.title, child.slug)
switch (child.slug) {
case 'faqs':
if (meta['publish']) {
if (!fs.existsSync(`${current_path}/faqs`)) {
fs.mkdirSync(`${current_path}/faqs`)
}
// this.__category_meta(`${current_path}/faqs`, 'FAQs', index+1)
await this.write_faqs(`${current_path}/faqs`)
}
break;
default:
if (meta['publish']) {
const token = child.node_token
const type = child.node_type
const slug = child.slug
const beta = meta['beta']
const notebook = meta['notebook']
const addedSince = meta['addSince']
const lastModified = meta['lastModified']
const deprecateSince = meta['deprecateSince']
const labels = meta['labels']
const keywords = meta['keywords']
console.log(`${current_path}/${slug}.md`)
await this.write_doc({
path: current_path,
page_title: child.title,
page_slug: child.slug,
page_beta: beta,
notebook: notebook,
addedSince: addedSince,
lastModified: lastModified,
deprecateSince: deprecateSince,
page_type: type,
page_token: token,
sidebar_position: index+1,
sidebar_label: labels,
keywords: keywords,
doc_card_list: false,
})
}
break;
}
}
})
}
}
async write_doc ({
path,
page_title,
page_slug,
page_beta,
notebook,
addedSince,
lastModified,
deprecateSince,
page_type,
page_token,
sidebar_position,
sidebar_label,
keywords,
doc_card_list
}) {
let obj;
let blocks;
if (page_token) {
obj = this.__fetch_doc_source('node_token', page_token, page_slug)
if (obj) {
blocks = obj.blocks.items
}
} else if (page_title) {
obj = this.__fetch_doc_source('title', page_title, page_slug)
if (obj) {
blocks = obj.blocks.items
}
}
if (blocks) {
this.page_blocks = blocks
}
let page = this.page_blocks.filter(block => block.block_type == 1)[0]
if (page && page.children) {
this.blocks = page.children.map(child => {
return this.__retrieve_block_by_id(child)
})
await this.__write_page({
title: page_title,
suffix: this.__title_suffix(path),
slug: page_slug,
beta: page_beta,
notebook: notebook,
addedSince: addedSince,
lastModified: lastModified,
deprecateSince: deprecateSince,
path: path,
type: page_type,
token: page_token,
sidebar_position: sidebar_position,
sidebar_label: sidebar_label,
keywords: keywords,
doc_card_list: doc_card_list,
})
}
}
__title_suffix(path) {
var suffix = 'Cloud'
if (path.includes('byoc')) {
suffix = 'BYOC'
} else if (path.includes('go/v1')) {
suffix = 'Go | v1'
} else if (path.includes('go/v2')) {
suffix = 'Go | v2'
} else if (path.includes('go')) {
suffix = 'Go'
} else if (path.includes('python/MilvusClient')) {
suffix = 'Python | MilvusClient'
} else if (path.includes('python/ORM')) {
suffix = 'Python | ORM'
} else if (path.includes('python')) {
suffix = 'Python'
} else if (path.includes('java/v1')) {
suffix = 'Java | v1'
} else if (path.includes('java/v2')) {
suffix = 'Java | v2'
} else if (path.includes('java')) {
suffix = 'Java'
} else if (path.includes('nodejs')) {
suffix = 'Node.js'
} else if (path.includes('restful/control-plane/v1')) {
suffix = 'RESTful | Control Plane | v1'
} else if (path.includes('restful/control-plane/v2')) {
suffix = 'RESTful | Control Plane | v2'
} else if (path.includes('restful/control-plane')) {
suffix = 'RESTful | Control Plane'
} else if (path.includes('restful/data-plane/v1')) {
suffix = 'RESTful | Data Plane | v1'
} else if (path.includes('restful/data-plane/v2')) {
suffix = 'RESTful | Data Plane | v2'
} else if (path.includes('restful/data-plane')) {
suffix = 'RESTful | Data Plane'
} else if (path.includes('restful')) {
suffix = 'RESTful'
}
return suffix
}
async write_faqs (path) {
const source = this.__fetch_doc_source('slug', 'faqs')
const title = source.title
const blocks = source.blocks.items
const suffix = path.includes('byoc') ? 'BYOC' : 'CLOUD'
if (blocks) {
this.page_blocks = blocks
}
let page = this.page_blocks.filter(block => block.block_type == 1)[0]
if (page && page.children) {
this.blocks = page.children.map(child => {
return this.__retrieve_block_by_id(child)
})
let a = await this.__markdown()
a = this.__filter_content(a, this.targets).split('\n')
let header_pos = a.map((line, index) => {
if (line.startsWith('##')) {
return index
}
}).filter(x => x !== undefined)
let sub_pages = []
for (let i = 0; i < header_pos.length; i++) {
let start = header_pos[i]
let end = header_pos[i+1]
let sub_page = a.slice(start, end)
sub_pages.push(sub_page)
}
// Write FAQs root page
let slug = 'faqs'
let front_matter = this.__front_matters(title, suffix, slug, null, null, source.node_type, source.node_token, 999, "", "", this.displayedSidebar, "Frequently asked questions")
const markdown = `${front_matter}\n\n# ${title}` + "\n\nimport DocCardList from '@theme/DocCardList';\n\n
/g, '
')
.replace(/(
){2,}/, "
")
.replace("
<\/p><\/td>\n)*\s*<\/tr>/g, '');
}
__match_filter_tags(markdown) {
const startTagRegex = /<(include|exclude) target="(.+?)"/gmi
const endTagRegex = /<\/(include|exclude)>/gmi
const matches = [... markdown.matchAll(startTagRegex)]
var returns = []
matches.forEach(match => {
var tag = match[1].toLowerCase()
var target = match[2].trim()
var rest = markdown.slice(match.index)
var closeTagRegex = new RegExp(`${tag}>`, 'gmi')
var closeTagMatch = [... rest.matchAll(closeTagRegex)]
var startIndex = match.index
var endIndex = -1
for (let i = 0; i < closeTagMatch.length; i++) {
var t = markdown.slice(startIndex, startIndex+closeTagMatch[i].index+closeTagMatch[i][0].length)
var startCount = t.match(startTagRegex) ? t.match(startTagRegex).length : 0
var endCount = t.match(endTagRegex) ? t.match(endTagRegex).length : 0
if (startCount === endCount) {
endIndex = startIndex + closeTagMatch[i].index + closeTagMatch[i][0].length
break
}
}
if (endIndex === -1) console.warn(`No matching end tag for ${tag} tag at index ${match.index}`)
returns.push({
tag: tag,
target: target,
startIndex: startIndex,
endIndex: endIndex
})
})
return returns
}
__extract_description(markdown) {
const content = markdown.split('\n')
const title = content.filter(line => line.startsWith('# '))
var description = "(placeholder)"
if (title.length > 0) {
description = content[content.indexOf(title[0])+2] ? content[content.indexOf(title[0])+2].trim() : "(placeholder)"
}
return description
}
async __write_page({title, suffix, slug, beta, notebook, addedSince, lastModified, deprecateSince, path, type, token, sidebar_position, sidebar_label, keywords, doc_card_list}) {
let markdown = await this.__markdown()
markdown = this.__filter_content(markdown, this.targets)
markdown = markdown.replace(/(\s*\n){3,}/g, '\n\n').replace(/(
){2,}/, "
").replace(/
/g, '
');
markdown = markdown.replace(/^[\||\s][\s|\||
]*\|\n/gm, '')
markdown = markdown.replace(/\s*
<\/p><\/td>\n)*\s*<\/tr>/g, '');
}
__example_http_urls(content) {
// Find all fenced code blocks and mark their ranges
const codeBlockRegex = /```[\s\S]*?```/g;
let codeBlocks = [];
let match;
while ((match = codeBlockRegex.exec(content)) !== null) {
codeBlocks.push({ start: match.index, end: match.index + match[0].length });
}
// Helper to check if a position is inside any code block
function isInCodeBlock(pos) {
return codeBlocks.some(block => pos >= block.start && pos < block.end);
}
// Match URLs, including those containing <, >, [, ], {, }
const urlRegex = /https?:\/\/[^\s'")]+/g;
let result = '';
let lastIndex = 0;
// Find all URLs and process those outside code blocks
while ((match = urlRegex.exec(content)) !== null) {
const urlStart = match.index;
const urlEnd = urlStart + match[0].length;
// Append content before the URL
result += content.slice(lastIndex, urlStart);
if (!isInCodeBlock(urlStart)) {
// If the url contains <, [, or {, treat it as an example and encode it
if (/[<\[\{]/.test(match[0])) {
result += match[0].replace('http', 'http')
} else {
result += match[0];
}
} else {
// Inside code block, leave as is
result += match[0];
}
lastIndex = urlEnd;
}
// Append remaining content
result += content.slice(lastIndex);
return result;
}
/**
 * Repeatedly compile `content` with the MDX compiler and, after each
 * compilation error, apply one escape/removal patch chosen by the error's
 * `ruleId`, until compilation succeeds, no patch applies, or 50 iterations
 * have been used.
 *
 * @param {string} content - MDX/markdown text to make compilable.
 * @returns {Promise<string>} The patched text (the last attempt when patching
 *   stops early). The original `content` is returned when the outer `catch`
 *   runs; that `catch` wraps the whole body, so it also receives any
 *   exception thrown while applying a patch, not only a failed import.
 */
async __mdx_patches(content) {
try {
// Import MDX compiler dynamically as it's an ES module
const { compile } = await import('@mdx-js/mdx');
let patchedContent = content;
let maxIterations = 50; // Prevent infinite loops
let iteration = 0;
while (iteration < maxIterations) {
try {
// Try to compile the current content
await compile(patchedContent, { development: false });
console.log(`MDX compilation succeeded after ${iteration} fixes`);
return patchedContent; // If compilation succeeds, return the fixed content
} catch (error) {
console.log(`MDX compilation error detected (iteration ${iteration + 1}): ${error.message}`);
// console.log(error)
// Identify problematic characters based on the error
let madeChanges = false;
let line, column, offset;
switch (error.ruleId) {
case 'acorn':
// Escape the nearest `{` found scanning backwards from the reported offset.
line = error.place.line;
column = error.place.column;
offset = error.place.offset;
// console.log(patchedContent.split('\n')[line-1]);
if (offset !== undefined && offset > 0 && offset < patchedContent.length) {
for (let i = offset - 1; i >= 0; i--) {
if (patchedContent[i] === '{') {
patchedContent = patchedContent.slice(0, i) + '\\' + patchedContent.slice(i);
madeChanges = true;
break;
}
}
}
break;
case 'end-tag-mismatch':
// Both the tag text and the `line:column-line:column` range are parsed out of the
// error message; a backslash is inserted at the range's start column on its start line.
let tag = error.message.match(/<(?!\/)([A-Za-z][A-Za-z0-9:_-]*)\b[^>]*>/g)?.[0];
let pos = error.message.match(/(\d+):(\d+)-(\d+):(\d+)/);
if (tag && pos) {
const start = { line: parseInt(pos[1]), column: parseInt(pos[2]) }
patchedContent = patchedContent.split('\n').map((line, index) => {
if (index === start.line - 1) {
line = line.slice(0, start.column - 1) + '\\' + line.slice(start.column - 1)
madeChanges = true;
}
return line
}).join('\n')
}
break;
case 'unexpected-closing-slash':
// For this specific error "Unexpected closing slash `/` in tag, expected an open tag first"
// it typically means there is a stray closing tag with no opening counterpart
// Remove erroneous closing tags at the end of document
const originalContent = patchedContent;
patchedContent = patchedContent.replace(/<\/(?:content|[\w\d]+)>\s*$/, '');
if (originalContent !== patchedContent) {
madeChanges = true;
} else {
// If no match at end, look for the erroneous tag anywhere in the content
// that might be causing the slash error
// The counts below read `patchedContent` as it was before this replace call
// returns, so every closing tag of an over-closed name is removed in one pass.
patchedContent = patchedContent.replace(/<[/](\w+)>/g, (match, tagName) => {
// If this tag doesn't have a matching opening tag, remove it
const openingTagCount = (patchedContent.match(new RegExp(`<${tagName}(?:\\s|>|/>)`, 'g')) || []).length;
const closingTagCount = (patchedContent.match(new RegExp(`<\\/${tagName}>`, 'g')) || []).length;
// If there are more closing tags than opening tags, this closing tag is erroneous
if (closingTagCount > openingTagCount) {
return ''; // Remove the erroneous closing tag
}
return match;
});
if (originalContent !== patchedContent) {
madeChanges = true;
}
}
break;
case 'unexpected-character':
// Only handled when the message mentions U+002C (comma) or U+002A (asterisk):
// escape the nearest `<` found scanning backwards from the reported offset.
if (error.message.includes('U+002C') || error.message.includes('U+002A')) {
offset = error.place.offset;
if (offset !== undefined && offset > 0 && offset < patchedContent.length) {
for (let i = offset-1; i >= 0; i--) {
if (patchedContent[i] === '<') {
patchedContent = patchedContent.slice(0, i) + '\\' + patchedContent.slice(i);
madeChanges = true;
break;
}
}
}
}
break;
default:
madeChanges = false;
break;
}
// An error that produced no patch would repeat forever, so stop here.
if (!madeChanges) {
console.warn('No changes made to content, breaking loop to prevent infinite iteration');
break;
}
}
iteration++;
}
if (iteration >= maxIterations) {
console.warn(`Maximum MDX patch iterations (${maxIterations}) reached, returning last attempt`);
}
return patchedContent;
} catch (error) {
console.error('Failed to import MDX compiler:', error.message);
return content; // Return original content if compiler import fails
}
}
async __page(page) {
return '# ' + await this.__text_elements(page['elements']);
}
async __text(text) {
return await this.__text_elements(text['elements']);
}
async __heading(heading, level) {
let content = await this.__text_elements(heading['elements'])
content = this.__clean_headings(content)
if (content.length > 0) {
if (content.indexOf('{#') < 0) {
let slug = slugify(content.split('|')[0].trim(), {lower: true, strict: true})
return '#'.repeat(level) + ' ' + content + '{#'+slug+'}';
} else {
return '#'.repeat(level) + ' ' + content;
}
} else {
return '';
}
}
__clean_headings(content) {
// filter content
content = this.__filter_content(content, this.targets)
// remove html tags
content = content.replace(/<\/?[^>]+(>|$)/g, "")
// remove trailing and leading spaces
content = content.trim()
return content
}
async __bullet(block, indent) {
let children = ''
if (block.children) {
children = block.children.map(child => {
return this.__retrieve_block_by_id(child)
})
children = await this.__markdown(children, indent+4)
}
let content = await this.__text_elements(block['bullet']['elements'])
return ' '.repeat(indent) + '- ' + content + '\n\n' + children;
}
async __ordered(block, indent) {
let children = ''
if (block.children) {
children = block.children.map(child => {
return this.__retrieve_block_by_id(child)
})
children = await this.__markdown(children, indent+4)
}
let content = await this.__text_elements(block['ordered']['elements'])
return ' '.repeat(indent) + '1. ' + content + '\n\n' + children;
}
async __callout(block, indent) {
let children = []
if (block.children) {
children = block.children.map(child => {
return this.__retrieve_block_by_id(child)
})
children = await this.__markdown(children, indent)
children = this.__filter_content(children, this.targets)
children = children.split('\n')
}
let emoji = block['callout']['emoji_id']
let type;
switch (emoji) {
case 'blue_book':
type = `
');
}));
const row_size = table['property']['row_size'];
const column_size = table['property']['column_size'];
var merge_info = table['property']['merge_info'];
merge_info = merge_info.map((merge, idx) => {
if (merge) {
for (var i = 1; i < merge.col_span; i++) {
merge_info[idx+i] = null;
}
for (var j = 1; j < merge.row_span; j++) {
merge_info[idx+j*column_size] = null;
}
}
return merge
})
var html = ' '.repeat(indent) + '\n';
for (var i = 0; i < row_size; i++) {
html += ' '.repeat(indent) +'
\n';
return html;
}
async __sheet(sheet, indent) {
const converter = new showdown.Converter({ underline: true })
const merges = sheet.meta?.data.sheet.merges;
const values = sheet.values.data.valueRange.values;
var result = ' '.repeat(indent) + "\n';
for (var j = 0; j < column_size; j++) {
const cell_idx = i * column_size + j;
const merge = merge_info[cell_idx];
if (merge) {
const colspan = merge.col_span > 1 ? ` colspan="${merge.col_span}"` : "";
const rowspan = merge.row_span > 1 ? ` rowspan="${merge.row_span}"` : "";
let cell_text = this.__filter_content(cell_texts[cell_idx], this.targets).trim()
.replace(/^\n/, '')
.replace(/ \n';
}
html += ' '.repeat(indent) + '
/g, '\n\n');
cell_text = converter.makeHtml(cell_text)
.replace(/\n/g, '')
.replace(/&/g, '&')
.replace(/\*/g, '*');
if (i === 0) {
html += ` ${' '.repeat(indent)} ${cell_text} \n`;
} else {
html += ` ${' '.repeat(indent)} ${cell_text} \n`;
}
}
}
html += ' '.repeat(indent) +' " + "\n";
values.forEach((row, ridx) => {
result += ' '.repeat(indent) + ' ' + "
" + "\n";
return result.replace('"{', '"\\{');
}
__sheet_cell(cell) {
if (cell instanceof Array) {
return cell.map(block => {
if (block['type'] === 'text') {
return block['text']
}
if (block['type'] === 'url') {
return `${block['text']}`
}
}).join('')
} else {
console.log(cell)
return ''
}
}
async __supademo(addons, indent) {
const record = JSON.parse(addons['record']);
return ' '.repeat(indent) + `" + "\n";
row.forEach((cell, cidx) => {
var colspan = "";
var rowspan = "";
if (merges) {
const match = merges.filter(merge => merge.start_row_index === ridx && merge.start_column_index === cidx);
if (match.length > 0) {
colspan = `colspan="${match[0].end_column_index -match[0].start_column_index + 1}"`;
rowspan = `rowspan="${match[0].end_row_index -match[0].start_row_index + 1}"`;
}
}
if (typeof cell ==='string') {
cell = cell.replace(/\n/g, ' " + "\n"
});
result += ' '.repeat(indent) + "
')
}
if (typeof cell === 'object') {
cell = this.__sheet_cell(cell)
}
if (typeof cell === 'number') {
cell = cell.toString()
}
cell = cell.trim().replace(/
/g, '\n\n');
if (ridx === 0) {
result += `${' '.repeat(indent) + ' '.repeat(2)}${converter.makeHtml(cell).replace(/\n/g, '')} \n`
} else {
result += `${' '.repeat(indent) + ' '.repeat(2)}${converter.makeHtml(cell).replace(/\n/g, '')} \n`
}
})
result += ' '.repeat(indent) + ' ' + "