// ============ Source-platform link cleanup rules ============

/** In-site link domains to strip: the <a> wrapper is dropped and only its content is kept. */
export const SOURCE_LINK_REMOVE_DOMAINS = [
  'zhida.zhihu.com',
]

/** Redirect-interstitial rules: domain → name of the query parameter that carries the real URL. */
export const SOURCE_LINK_REDIRECT_RULES: Array<{ domain: string; param: string }> = [
  { domain: 'link.zhihu.com', param: 'target' },
]

/**
 * Outcome of matching an href against the source-platform rules.
 * - `none`: no rule applies, the link is left to the other Turndown rules.
 * - `remove`: the link wrapper must be dropped, keeping only its content.
 * - `redirect`: the link points at a redirect interstitial; `target` is the real
 *   destination, or null when it is missing or not an absolute http(s) URL.
 */
type SourceLinkAction =
  | { kind: 'none' }
  | { kind: 'remove' }
  | { kind: 'redirect'; target: string | null }

/** Parse `raw` as a URL (resolved against `base` when given); returns null instead of throwing. */
function parseSourceLinkUrl(raw: string, base?: string): URL | null {
  try {
    return new URL(raw, base)
  } catch {
    return null
  }
}

/** True when `host` is exactly `domain` or one of its subdomains. */
function sourceLinkHostMatches(host: string, domain: string): boolean {
  const wanted = domain.toLowerCase()
  return host === wanted || host.endsWith('.' + wanted)
}

/**
 * Decide what to do with an `<a href>` value.
 *
 * Matching is done on the parsed hostname rather than on the raw string, so a URL
 * that merely mentions a rule domain in its path or query is not affected.
 * Remove rules take precedence over redirect rules.
 */
function classifySourceLink(href: string): SourceLinkAction {
  // The placeholder base lets relative and protocol-relative hrefs
  // (e.g. "//link.zhihu.com/?target=...") parse instead of throwing.
  const url = parseSourceLinkUrl(href, 'https://source-link.invalid/')
  if (!url) return { kind: 'none' }
  const host = url.hostname.toLowerCase()

  if (SOURCE_LINK_REMOVE_DOMAINS.some(d => sourceLinkHostMatches(host, d))) {
    return { kind: 'remove' }
  }

  const rule = SOURCE_LINK_REDIRECT_RULES.find(r => sourceLinkHostMatches(host, r.domain))
  if (!rule) return { kind: 'none' }

  // The query parameter is attacker-controllable: only accept absolute http(s) targets
  // so that "javascript:" / "data:" URLs never end up in the generated Markdown.
  const candidate = url.searchParams.get(rule.param)
  const parsed = candidate ? parseSourceLinkUrl(candidate) : null
  const safe = parsed !== null && (parsed.protocol === 'http:' || parsed.protocol === 'https:')
  return { kind: 'redirect', target: safe ? candidate : null }
}

/** Backslash-escape parentheses so the URL cannot terminate the Markdown link destination early. */
function escapeMarkdownLinkDestination(url: string): string {
  return url.replace(/([()])/g, '\\$1')
}

  // Source-platform link cleanup (strip in-site links, unwrap redirect interstitials).
  // This statement is registered inside addExtensionRules(turndownService).
  turndownService.addRule('sourcePlatformLinks', {
    // Claim only <a> elements whose href matches a remove or redirect rule.
    filter: function(node) {
      if (node.nodeName !== 'A') return false
      const href = (node as Element).getAttribute('href') || ''
      return classifySourceLink(href).kind !== 'none'
    },
    // Removed links render as their bare content; redirect links render as a Markdown
    // link to the real target, falling back to the original href when no usable
    // target is present.
    replacement: function(content, node) {
      const href = (node as Element).getAttribute('href') || ''
      const action = classifySourceLink(href)
      if (action.kind === 'remove') return content
      const destination = action.kind === 'redirect' && action.target ? action.target : href
      return '[' + content + '](' + escapeMarkdownLinkDestination(destination) + ')'
    }
  })
/**
 * Clean up source-platform links inside `container`, mutating the DOM in place.
 *
 * - Links whose host matches `removeDomains` are unwrapped: the <a> element is replaced
 *   by its own child nodes, so only the content it wrapped remains.
 * - Links whose host matches a `redirectRules` entry get their `href` replaced by the
 *   real destination carried in that rule's query parameter.
 *
 * The default rules come from @wechatsync/core (SOURCE_LINK_REMOVE_DOMAINS and
 * SOURCE_LINK_REDIRECT_RULES); supporting another platform only requires adding an
 * entry there. preprocessForPlatform runs this as a fixed step before its optional
 * link-removal step.
 *
 * @param container - Root element whose descendant <a> elements are processed.
 * @param removeDomains - Hostnames (subdomains included) whose links are unwrapped.
 * @param redirectRules - Redirect hosts and the query parameter holding the real URL.
 */
function cleanSourcePlatformLinks(
  container: HTMLElement,
  removeDomains: readonly string[] = SOURCE_LINK_REMOVE_DOMAINS,
  redirectRules: ReadonlyArray<{ domain: string; param: string }> = SOURCE_LINK_REDIRECT_RULES,
): void {
  // Placeholder origin so relative and protocol-relative hrefs
  // (e.g. "//link.zhihu.com/?target=...") parse instead of throwing.
  const placeholderBase = 'https://source-link.invalid/'

  // Parse without throwing; malformed values simply yield null.
  const parseUrl = (raw: string, base?: string): URL | null => {
    try {
      return new URL(raw, base)
    } catch {
      return null
    }
  }

  // Compare hostnames, not raw strings, so a URL that merely mentions a rule domain
  // in its path or query is left alone.
  const hostMatches = (host: string, domain: string): boolean => {
    const wanted = domain.toLowerCase()
    return host === wanted || host.endsWith('.' + wanted)
  }

  // Snapshot the matches first: unwrapping mutates the tree while we iterate.
  for (const link of Array.from(container.querySelectorAll('a'))) {
    const url = parseUrl(link.getAttribute('href') || '', placeholderBase)
    if (!url) continue
    const host = url.hostname.toLowerCase()

    if (removeDomains.some(d => hostMatches(host, d))) {
      const parent = link.parentNode
      if (!parent) continue
      // Hoist every child in front of the link, then drop the now-empty wrapper.
      while (link.firstChild) {
        parent.insertBefore(link.firstChild, link)
      }
      parent.removeChild(link)
      continue
    }

    const rule = redirectRules.find(r => hostMatches(host, r.domain))
    if (!rule) continue

    const target = url.searchParams.get(rule.param)
    if (!target) continue

    // The query parameter is attacker-controllable: only absolute http(s) targets are
    // written back, so "javascript:" / "data:" URLs never become the link's href.
    const resolved = parseUrl(target)
    if (resolved && (resolved.protocol === 'http:' || resolved.protocol === 'https:')) {
      link.setAttribute('href', target)
    }
  }
}