From b4d35f5139fde272f77127221063700d76e4e50b Mon Sep 17 00:00:00 2001 From: JayWP <2316468683@qq.com> Date: Mon, 13 Apr 2026 18:04:53 +0800 Subject: [PATCH] vault backup: 2026-04-13 18:04:51 --- .claude/skills/kb/README.md | 160 ++++++++ .claude/skills/kb/SKILL.md | 327 +++++++++++++++ .claude/skills/kb/index.html | 381 ++++++++++++++++++ .../skills/kb/scripts/extractors/__init__.py | 0 .../kb/scripts/extractors/docx_extractor.py | 28 ++ .../kb/scripts/extractors/excel_extractor.py | 34 ++ .../kb/scripts/extractors/image_extractor.py | 20 + .../kb/scripts/extractors/pdf_extractor.py | 34 ++ .claude/skills/kb/scripts/ingest.py | 102 +++++ .claude/skills/kb/scripts/requirements.txt | 6 + .claude/skills/kb/templates/MASTER-INDEX.md | 4 + .claude/skills/kb/templates/ONTOLOGY.md | 50 +++ .claude/skills/kb/templates/RAW-REGISTRY.md | 4 + .claude/skills/kb/templates/TOPIC-MAP.md | 3 + .../conv-1776074446367-y9l6jom6z.meta.json | 23 ++ .obsidian/community-plugins.json | 3 +- .obsidian/plugins/claudian/data.json | 11 + ...¼Œå‘Šåˆ« RAG 幻觉?我åšäº†ä¸€ä¸ªå·¥å…·ï¼ŒæŠŠè¿™å¥—方法è½åœ°äº†.md | 0 ok.md | 1 - 未命å.canvas | 1 - 20 files changed, 1189 insertions(+), 3 deletions(-) create mode 100644 .claude/skills/kb/README.md create mode 100644 .claude/skills/kb/SKILL.md create mode 100644 .claude/skills/kb/index.html create mode 100644 .claude/skills/kb/scripts/extractors/__init__.py create mode 100644 .claude/skills/kb/scripts/extractors/docx_extractor.py create mode 100644 .claude/skills/kb/scripts/extractors/excel_extractor.py create mode 100644 .claude/skills/kb/scripts/extractors/image_extractor.py create mode 100644 .claude/skills/kb/scripts/extractors/pdf_extractor.py create mode 100644 .claude/skills/kb/scripts/ingest.py create mode 100644 .claude/skills/kb/scripts/requirements.txt create mode 100644 .claude/skills/kb/templates/MASTER-INDEX.md create mode 100644 .claude/skills/kb/templates/ONTOLOGY.md create mode 100644 .claude/skills/kb/templates/RAW-REGISTRY.md create mode 100644 .claude/skills/kb/templates/TOPIC-MAP.md create mode 100644 .claudian/sessions/conv-1776074446367-y9l6jom6z.meta.json create mode 100644 .obsidian/plugins/claudian/data.json rename 测试.md => Karpathy 让 AI 自己管知识库,告别 RAG 幻觉?我åšäº†ä¸€ä¸ªå·¥å…·ï¼ŒæŠŠè¿™å¥—方法è½åœ°äº†.md (100%) delete mode 100644 ok.md delete mode 100644 未命å.canvas diff --git a/.claude/skills/kb/README.md b/.claude/skills/kb/README.md new file mode 100644 index 0000000..6ab7051 --- /dev/null +++ b/.claude/skills/kb/README.md @@ -0,0 +1,160 @@ +# /kb — LLM 知识库管ç†å·¥å…· + +基于 Karpathy çš„ LLM Knowledge Base 模å¼ï¼šraw/ 存原始资料,LLM ç¼–è¯‘æˆ wiki/,索引替代 RAG。 + +## 快速开始 + +### 1. åˆå§‹åŒ–知识库 + +``` +/kb init +``` + +在当å‰ç›®å½•创建知识库目录结构: +- `raw/` — 原始资料(åªè¯»ï¼‰ +- `wiki/concepts/` — 核心概念 +- `wiki/sources/` — æ¥æºæ‘˜è¦ +- `wiki/comparisons/` — å¯¹æ¯”åˆ†æž +- `output/analysis/` — åˆ†æžæŠ¥å‘Š +- `output/slides/` — å¹»ç¯ç‰‡ +- `index/` — 索引文件 + +### 2. 导入文件 + +å°† PDFã€Excelã€å›¾ç‰‡ã€Word 文档放入 `raw/` 目录,然åŽï¼š + +``` +/kb ingest +``` + +自动æå–文本并登记到索引。 + +### 3. 编译为 Wiki + +``` +/kb compile +``` + +LLM 读å–原料,生æˆç»“构化 wiki 文章。 + +### 4. 查询知识库 + +``` +/kb query "你的问题" +``` + +生æˆç»“构化报告,包å«åˆ†æžã€ç»“论和回填建议。 + +### 5. 回填有价值的结果 + +``` +/kb file +``` + +将查询报告中有价值的内容并入 wiki。 + +### 6. å¥åº·æ£€æŸ¥ + +``` +/kb lint +``` + +六项检查:断链ã€å­¤å²›ã€æº¯æºã€ä¸€è‡´æ€§ã€è¦†ç›–度ã€ç©ºç™½å‘现。 + +### 7. æŸ¥çœ‹çŠ¶æ€ + +``` +/kb status +``` + +仪表盘展示整体å¥åº·åº¦å’Œç»Ÿè®¡ä¿¡æ¯ã€‚ + +--- + +## å­å‘½ä»¤é€ŸæŸ¥ + +| 命令 | 功能 | 触å‘è¯ | +|------|------|--------| +| `kb init [目录]` | åˆå§‹åŒ–知识库 | "åˆå§‹åŒ–"ã€"创建知识库" | +| `kb ingest` | é¢„å¤„ç† raw/ 文件 | "导入"ã€"å¤„ç†æ–°æ–‡ä»¶" | +| `kb compile [文件]` | 编译为 wiki | "编译"ã€"æ›´æ–° wiki" | +| `kb query "<问题>"` | 查询知识库 | "查知识库"ã€"问知识库" | +| `kb file [报告]` | 回填到 wiki | "回填"ã€"å½’æ¡£" | +| `kb lint` | å¥åº·æ£€æŸ¥ | "检查"ã€"lint" | +| `kb status` | 状æ€ä»ªè¡¨ç›˜ | "状æ€"ã€"看看知识库" | + +--- + +## 支æŒçš„æ–‡ä»¶æ ¼å¼ + +| æ ¼å¼ | åŽç¼€ | 说明 | +|------|------|------| +| PDF | .pdf | æå–文本和图片 | +| Excel | .xlsx, .xls, .csv | æå–表格内容 | +| 图片 | .png, .jpg, .jpeg | OCR 文字识别 | +| Word | .docx | æå–段è½å’Œè¡¨æ ¼ | + +--- + +## 工作æµç¨‹ + +``` +投喂原料 LLM 编译 查询使用 + │ │ │ + â–¼ â–¼ â–¼ + raw/ ──────► wiki/ ──────► æŸ¥è¯¢åˆ†æž â”€â”€â”€â”€â”€â”€â–º 回填 + │ │ │ + 原始文件 结构化文章 知识增长 +``` + +--- + +## 目录结构 + +``` +{知识库根目录}/ +├── raw/ # 原始资料(åªè¯»ï¼‰ +│ └── .extracted/ # æå–的文本(自动生æˆï¼‰ +├── wiki/ +│ ├── concepts/ # 核心概念 +│ ├── sources/ # æ¥æºæ‘˜è¦ +│ └── comparisons/ # å¯¹æ¯”åˆ†æž +├── output/ +│ ├── analysis/ # 查询报告 +│ └── slides/ # å¹»ç¯ç‰‡ +├── index/ +│ ├── MASTER-INDEX.md # 全局索引 +│ ├── TOPIC-MAP.md # 主题分组 +│ ├── RAW-REGISTRY.md # 原始文件登记 +│ ├── LINT-REPORT.md # å¥åº·æ£€æŸ¥æŠ¥å‘Š +│ └── ONTOLOGY.md # 本体定义 +└── scripts/ + ├── ingest.py # 预处ç†è„šæœ¬ + └── extractors/ # 文件æå–器 +``` + +--- + +## Python ä¾èµ– + +首次使用需è¦å®‰è£…ä¾èµ–: + +```bash +pip install -r .claude/skills/kb/scripts/requirements.txt +``` + +ä¾èµ–列表: +- PyMuPDF — PDF æå– +- openpyxl — Excel è¯»å– +- pandas — æ•°æ®å¤„ç† +- pytesseract — 图片 OCR +- python-docx — Word è¯»å– +- Pillow — å›¾ç‰‡å¤„ç† + +--- + +## SessionStart Hook(å¯é€‰ï¼‰ + +é…ç½®åŽï¼Œæ¯æ¬¡æ‰“å¼€ Claude Code 会自动检测 `raw/` 中的新文件并æé†’处ç†ã€‚ + +åˆå§‹åŒ–时选择"是"å³å¯å¯ç”¨ã€‚ diff --git a/.claude/skills/kb/SKILL.md b/.claude/skills/kb/SKILL.md new file mode 100644 index 0000000..40dfca1 --- /dev/null +++ b/.claude/skills/kb/SKILL.md @@ -0,0 +1,327 @@ +--- +name: kb +description: | + LLM 驱动的知识库管ç†å·¥å…·ç®±ã€‚当用户说"kb"ã€"知识库"ã€"查知识库"ã€"åˆå§‹åŒ–知识库"ã€"导入文件"ã€"编译"ã€"回填"等时触å‘。 + 支æŒå¯¹ vault æˆ–å¤–éƒ¨ç›®å½•å»ºç«‹çŸ¥è¯†åº“ï¼šé¢„å¤„ç†æ–‡ä»¶ã€ç¼–译 wikiã€æŸ¥è¯¢åˆ†æžã€å¥åº·æ£€æŸ¥ã€‚ + 基于 Karpathy çš„ LLM Knowledge Base 模å¼ï¼šraw/ 存原始资料,LLM ç¼–è¯‘æˆ wiki/,索引替代 RAG。 +user-invocable: true +--- + +# /kb — LLM çŸ¥è¯†åº“ç®¡ç† + +统一入å£ï¼ŒåŒ…å« 7 个å­å‘½ä»¤ã€‚ + +## å­å‘½ä»¤é€ŸæŸ¥ + +| 命令 | 功能 | 触å‘è¯ | +|------|------|--------| +| `kb init [目录]` | åˆå§‹åŒ–知识库 | "åˆå§‹åŒ–"ã€"创建知识库" | +| `kb ingest` | é¢„å¤„ç† raw/ 文件 | "导入"ã€"å¤„ç†æ–°æ–‡ä»¶" | +| `kb compile [文件]` | 编译为 wiki | "编译"ã€"æ›´æ–° wiki" | +| `kb query "<问题>"` | 查询知识库 | "查知识库"ã€"问知识库" | +| `kb file [报告]` | 回填到 wiki | "回填"ã€"å½’æ¡£" | +| `kb lint` | å¥åº·æ£€æŸ¥ | "检查"ã€"lint" | +| `kb status` | 状æ€ä»ªè¡¨ç›˜ | "状æ€"ã€"看看知识库" | + +--- + +## kb init [目标目录] + +åˆå§‹åŒ–知识库目录结构ã€ç´¢å¼•和本体定义。 + +**傿•°**:å¯é€‰ç›®æ ‡ç›®å½•,默认当å‰ç›®å½•(vault)或指定外部目录。 + +### 执行步骤 + +1. **检查现有知识库**:查找 `{target}/index/MASTER-INDEX.md`,如果存在则警告并等待确认 + +2. **创建目录结构**: + ``` + {target}/raw/ — 原始资料(åªè¯»ï¼‰ + {target}/wiki/concepts/ — 核心概念 + {target}/wiki/sources/ — æ¥æºæ‘˜è¦ + {target}/wiki/comparisons/ — å¯¹æ¯”åˆ†æž + {target}/output/analysis/ — åˆ†æžæŠ¥å‘Š + {target}/output/slides/ — å¹»ç¯ç‰‡ + {target}/index/ — 索引文件 + {target}/scripts/ — 预处ç†è„šæœ¬ + ``` + +3. **å¤åˆ¶æ¨¡æ¿æ–‡ä»¶**:从本 Skill çš„ `templates/` 目录å¤åˆ¶åˆ° `{target}/index/`: + - ONTOLOGY.md — 实体类型和关系定义 + - MASTER-INDEX.md — 全局索引 + - TOPIC-MAP.md — 主题分组 + - RAW-REGISTRY.md — 原始文件登记 + +4. **å¤åˆ¶è„šæœ¬**:从本 Skill çš„ `scripts/` 目录å¤åˆ¶åˆ° `{target}/scripts/` + +5. **检查 Python ä¾èµ–**: + ```bash + pip show pymupdf openpyxl pandas pytesseract python-docx Pillow 2>&1 + ``` + 报告缺失的包,询问是å¦å®‰è£… + +6. **é…ç½® SessionStart Hook(å¯é€‰ï¼‰**:询问是å¦é…置,检测 raw/ 新文件时æé†’ + +7. **输出åˆå§‹åŒ–摘è¦** + +--- + +## kb ingest + +é¢„å¤„ç† raw/ 中的新文件并登记到索引。 + +**å‰ç½®æ¡ä»¶**:知识库已åˆå§‹åŒ–(存在 index/RAW-REGISTRY.md) + +### æ”¯æŒæ ¼å¼ +- PDF (.pdf) +- Excel (.xlsx, .xls, .csv) +- 图片 (.png, .jpg, .jpeg) — OCR æå– +- Word (.docx) + +### 执行步骤 + +1. **定ä½çŸ¥è¯†åº“**:å‘上查找 `index/RAW-REGISTRY.md` + +2. **è¿è¡Œé¢„处ç†è„šæœ¬**: + ```bash + python3 {skill_dir}/scripts/ingest.py {kb_root} + ``` + è„šæœ¬è‡ªåŠ¨ï¼šæ‰«ææ–°æ–‡ä»¶ → 按类型æå–文本 → è¾“å‡ºæ‘˜è¦ + +3. **登记到 RAW-REGISTRY.md**:为æ¯ä¸ªæ–°æ–‡ä»¶æ·»åŠ æ¡ç›®ï¼š + - 文件路径ã€ç±»åž‹ã€æ‘˜è¦ï¼ˆä¸€å¥è¯ï¼‰ + - 状æ€ï¼š`pending`(待编译) + +4. **输出摘è¦**:报告导入数é‡ï¼Œæç¤ºä¸‹ä¸€æ­¥ `/kb-compile` + +--- + +## kb compile [文件] + +å°† raw/ 中已导入但未编译的文件编译为 wiki 文章。 + +**傿•°**:å¯é€‰æŒ‡å®šæ–‡ä»¶ï¼Œé»˜è®¤å¤„ç†æ‰€æœ‰ `status=pending` çš„æ¡ç›® + +### 核心原则 +- Wiki 文章由 LLM 生æˆï¼Œéµå¾ª ONTOLOGY.md 定义 +- æ¯ç¯‡æ–‡ç« å¿…须有完整 YAML frontmatter +- 使用 `[[åŒé“¾]]` å»ºç«‹å…³è” +- 编译是增é‡çš„ + +### 执行步骤 + +1. **检查待编译æ¡ç›®**:读 `index/RAW-REGISTRY.md`,找 `status=pending` çš„æ¡ç›® + - å¦‚æžœæ²¡æœ‰ï¼Œå‘ŠçŸ¥ç”¨æˆ·å¹¶ç»“æŸ + +2. **加载上下文**:读 ONTOLOGY.mdã€MASTER-INDEX.mdã€TOPIC-MAP.md + +3. **é€ä¸ªç¼–译**: + - è¯»å–æºæ–‡ä»¶æˆ– `raw/.extracted/` 下的æå–文本 + - 判断æ“作:新建 / 更新已有 / 综åˆåˆ†æž + - 按模æ¿ç”Ÿæˆ wiki 文章 + - æ›´æ–° frontmatter(type, id, compiled_from, related, last_compiled) + - 用 `[[åŒé“¾]]` 链接相关文章 + +4. **更新索引**: + - MASTER-INDEX.md 添加/æ›´æ–°æ¡ç›® + - TOPIC-MAP.md 归入主题 + - RAW-REGISTRY.md çŠ¶æ€æ”¹ä¸º `done`,填编译产物路径 + +5. **输出编译摘è¦** + +--- + +## kb query "<问题>" + +对知识库æé—®ï¼Œç”Ÿæˆç»“构化报告。 + +**傿•°**:必填,用户的问题 + +### 执行步骤 + +1. **定ä½çŸ¥è¯†åº“**:查找 `index/MASTER-INDEX.md` + +2. **检索相关文章**: + - 读 MASTER-INDEX.md 定ä½ç›¸å…³æ–‡ä»¶ + - 按需读 TOPIC-MAP.md ç²¾ç¡®å®šä½ + - è¯»å–æ‰€æœ‰ç›¸å…³ wiki 文章内容 + +3. **研究分æž**: + - 基于 wiki 内容深入分æžé—®é¢˜ + - 交å‰å¯¹æ¯”多篇文章 + - ç»“è®ºå¿…é¡»åŸºäºŽå®žé™…å†…å®¹ï¼Œæ ‡æ³¨æ¥æº + +4. **ç”ŸæˆæŠ¥å‘Š**:ä¿å­˜åˆ° `output/analysis/YYYY-MM-DD-{topic-slug}.md`: + ```markdown + # {报告标题} + + - **Date**: YYYY-MM-DD + - **Query**: {用户问题} + - **Sources**: {引用的 wiki 文章} + + --- + + ## åˆ†æž + {详细分æžï¼Œå¼•用具体文章用 [[åŒé“¾]]} + + ## 结论 + {核心å‘现} + + ## 回填建议 + - [ ] {具体建议} + ``` + +5. **输出结果**:展示摘è¦ï¼Œæç¤ºå¯è¿è¡Œ `/kb file` 回填 + +--- + +## kb file [报告路径] + +将查询输出回填到 wiki 知识库。 + +**傿•°**:å¯é€‰æŒ‡å®š output/ 下的报告文件,默认扫æ `output/analysis/` + +### 执行步骤 + +1. **定ä½çŸ¥è¯†åº“和待回填内容** + +2. **展示回填建议**:列出所有建议,编å·è¯´æ˜Ž + +3. **用户确认**ï¼šé€æ¡ Y/N æˆ–æ‰¹é‡æ“作 + +4. **执行回填**: + - **更新已有文章**:将新内容有机èžå…¥ + - **新建文章**:按 ONTOLOGY.md 模æ¿åˆ›å»º + +5. **更新索引**:MASTER-INDEX.md å’Œ TOPIC-MAP.md + +6. **输出摘è¦** + +--- + +## kb lint + +对知识库进行六项å¥åº·æ£€æŸ¥ã€‚ + +### 检查项目 + +| 检查 | 说明 | +|------|------| +| 断链 | `[[链接]]` 指å‘ä¸å­˜åœ¨çš„æ–‡ä»¶ | +| 孤岛 | 没有被任何文章链接的文章 | +| æº¯æº | frontmatter compiled_from 指å‘已删除的文件 | +| 一致性 | åŒä¸€æ¦‚念在ä¸åŒæ–‡ç« ä¸­çš„矛盾æè¿° | +| 覆盖度 | 未编译文件比例 | +| 空白å‘现 | 被æåŠä½†æ²¡æœ‰ç‹¬ç«‹æ–‡ç« çš„æ¦‚念 | + +### 执行步骤 + +1. **定ä½çŸ¥è¯†åº“** + +2. **执行六项检查** + +3. **输出 Lint 报告**(按严é‡ç¨‹åº¦æŽ’åºï¼‰ + +4. **æä¾›ä¿®å¤é€‰é¡¹**:å¯è‡ªåŠ¨ä¿®å¤çš„é—®é¢˜è¯¢é—®æ˜¯å¦æ‰§è¡Œ + +5. **ä¿å­˜æŠ¥å‘Šåˆ° `index/LINT-REPORT.md`** + +--- + +## kb status + +展示知识库整体状æ€ä»ªè¡¨ç›˜ã€‚ + +### 执行步骤 + +1. **定ä½çŸ¥è¯†åº“** + +2. **收集统计数æ®**: + - raw/ 文件数 + - wiki/ 文章数和字数 + - 编译率 + - 待回填报告数 + - 上次 lint 结果 + +3. **展示仪表盘**: + ``` + çŸ¥è¯†åº“çŠ¶æ€ + â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â• + 原始文件: N 个 + Wiki 文章: M 篇 (å…± ~X å­—) + 编译率: XX% + 待回填: Y 份报告 + 上次 Lint: 日期 — é—®é¢˜æ‘˜è¦ + â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â• + + 最近编译的文章: + - wiki/concepts/xxx.md (日期) + + 待处ç†: + - N 个文件待编译 → /kb compile + - M 份报告待回填 → /kb file + ``` + +4. **建议下一步æ“作** + +--- + +## 目录结构约定 + +``` +{知识库根目录}/ +├── raw/ # 原始资料(åªè¯»ï¼‰ +│ └── .extracted/ # æå–的文本(自动生æˆï¼‰ +├── wiki/ +│ ├── concepts/ # 核心概念 +│ ├── sources/ # æ¥æºæ‘˜è¦ +│ └── comparisons/ # å¯¹æ¯”åˆ†æž +├── output/ +│ ├── analysis/ # 查询报告 +│ └── slides/ # å¹»ç¯ç‰‡ +├── index/ +│ ├── MASTER-INDEX.md # 全局索引 +│ ├── TOPIC-MAP.md # 主题分组 +│ ├── RAW-REGISTRY.md # 原始文件登记 +│ ├── LINT-REPORT.md # å¥åº·æ£€æŸ¥æŠ¥å‘Š +│ └── ONTOLOGY.md # 本体定义 +└── scripts/ + ├── ingest.py # 预处ç†è„šæœ¬ + ├── requirements.txt # Python ä¾èµ– + └── extractors/ # å„类文件æå–器 +``` + +## 实体类型(ONTOLOGY.md) + +| 类型 | 目录 | 命å规则 | +|------|------|----------| +| concept | wiki/concepts/ | {slug}.md | +| source | wiki/sources/ | {slug}.md | +| comparison | wiki/comparisons/ | {a}-vs-{b}.md | + +## Wiki 文章 Frontmatter æ¨¡æ¿ + +```yaml +--- +type: concept +id: {slug} +aliases: [] +compiled_from: + - raw/{source_file} +related: + - "[[other-article]]" +last_compiled: YYYY-MM-DD +--- +``` + +--- + +## 故障排除 + +| 问题 | 解决方案 | +|------|----------| +| 找ä¸åˆ°çŸ¥è¯†åº“ | å…ˆè¿è¡Œ `/kb init` åˆå§‹åŒ– | +| 脚本报错 | è¿è¡Œ `pip install -r scripts/requirements.txt` | +| 编译率低 | è¿è¡Œ `/kb ingest` å¯¼å…¥æ–°æ–‡ä»¶ï¼Œç„¶åŽ `/kb compile` | +| 断链太多 | è¿è¡Œ `/kb lint` æŸ¥çœ‹è¯¦æƒ…ï¼Œæ‰‹åŠ¨ä¿®å¤æˆ–删除断链 | diff --git a/.claude/skills/kb/index.html b/.claude/skills/kb/index.html new file mode 100644 index 0000000..7fc9926 --- /dev/null +++ b/.claude/skills/kb/index.html @@ -0,0 +1,381 @@ + + + + + + /kb — LLM 知识库管ç†å·¥å…· + + + +
+

/kb — LLM 知识库管ç†å·¥å…·

+

基于 Karpathy çš„ LLM Knowledge Base 模å¼ï¼šraw/ 存原始资料,LLM ç¼–è¯‘æˆ wiki/,索引替代 RAG。

+ +

🚀 快速开始

+ +

1. åˆå§‹åŒ–知识库

+
/kb init
+

在当å‰ç›®å½•创建知识库目录结构:

+ +
+
+├── raw/                    # 原始资料(åªè¯»ï¼‰
+├── wiki/
+│   ├── concepts/          # 核心概念
+│   ├── sources/           # æ¥æºæ‘˜è¦
+│   └── comparisons/       # 对比分æž
+├── output/
+│   ├── analysis/          # åˆ†æžæŠ¥å‘Š
+│   └── slides/           # å¹»ç¯ç‰‡
+└── index/                # 索引文件
+      
+
+ +

2. 导入文件

+

å°† PDFã€Excelã€å›¾ç‰‡ã€Word 文档放入 raw/ 目录,然åŽï¼š

+
/kb ingest
+

自动æå–文本并登记到索引。

+ +

3. 编译为 Wiki

+
/kb compile
+

LLM 读å–原料,生æˆç»“构化 wiki 文章。

+ +

4. 查询知识库

+
/kb query "你的问题"
+

生æˆç»“构化报告,包å«åˆ†æžã€ç»“论和回填建议。

+ +

5. 回填有价值的结果

+
/kb file
+

将查询报告中有价值的内容并入 wiki。

+ +

6. å¥åº·æ£€æŸ¥

+
/kb lint
+

六项检查:断链ã€å­¤å²›ã€æº¯æºã€ä¸€è‡´æ€§ã€è¦†ç›–度ã€ç©ºç™½å‘现。

+ +

7. 查看状æ€

+
/kb status
+

仪表盘展示整体å¥åº·åº¦å’Œç»Ÿè®¡ä¿¡æ¯ã€‚

+ +

📋 å­å‘½ä»¤é€ŸæŸ¥

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
命令功能触å‘è¯
kb init [目录]åˆå§‹åŒ–知识库åˆå§‹åŒ–ã€åˆ›å»ºçŸ¥è¯†åº“
kb ingesté¢„å¤„ç† raw/ 文件导入ã€å¤„ç†æ–°æ–‡ä»¶
kb compile [文件]编译为 wikiç¼–è¯‘ã€æ›´æ–° wiki
kb query "<问题>"查询知识库查知识库ã€é—®çŸ¥è¯†åº“
kb file [报告]回填到 wiki回填ã€å½’æ¡£
kb lintå¥åº·æ£€æŸ¥æ£€æŸ¥ã€lint
kb status状æ€ä»ªè¡¨ç›˜çжæ€ã€çœ‹çœ‹çŸ¥è¯†åº“
+ +

📦 支æŒçš„æ–‡ä»¶æ ¼å¼

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
æ ¼å¼åŽç¼€è¯´æ˜Ž
PDF.pdfæå–文本和图片
Excel.xlsx, .xls, .csvæå–表格内容
图片.png, .jpg, .jpegOCR 文字识别
Word.docxæå–段è½å’Œè¡¨æ ¼
+ +

🔄 工作æµç¨‹

+ +
+
投喂原料
raw/
+ → +
LLM 编译
wiki/
+ → +
查询使用
/kb query
+ → +
知识增长
/kb file
+
+ +

📠完整目录结构

+ +
+{知识库根目录}/
+├── raw/                    # 原始资料(åªè¯»ï¼‰
+│   └── .extracted/        # æå–的文本(自动生æˆï¼‰
+├── wiki/
+│   ├── concepts/          # 核心概念
+│   ├── sources/           # æ¥æºæ‘˜è¦
+│   └── comparisons/       # 对比分æž
+├── output/
+│   ├── analysis/          # 查询报告
+│   └── slides/           # å¹»ç¯ç‰‡
+├── index/
+│   ├── MASTER-INDEX.md   # 全局索引
+│   ├── TOPIC-MAP.md      # 主题分组
+│   ├── RAW-REGISTRY.md   # 原始文件登记
+│   ├── LINT-REPORT.md    # å¥åº·æ£€æŸ¥æŠ¥å‘Š
+│   └── ONTOLOGY.md       # 本体定义
+└── scripts/
+    ├── ingest.py          # 预处ç†è„šæœ¬
+    └── extractors/        # 文件æå–器
+    
+ +

ðŸ Python ä¾èµ–

+ +

首次使用需è¦å®‰è£…ä¾èµ–:

+
pip install -r .claude/skills/kb/scripts/requirements.txt
+ +
+ ä¾èµ–列表: + +
+ +

âš™ï¸ SessionStart Hook(å¯é€‰ï¼‰

+ +

é…ç½®åŽï¼Œæ¯æ¬¡æ‰“å¼€ Claude Code 会自动检测 raw/ 中的新文件并æé†’处ç†ã€‚

+

åˆå§‹åŒ–时选择"是"å³å¯å¯ç”¨ã€‚

+ + +
+ + diff --git a/.claude/skills/kb/scripts/extractors/__init__.py b/.claude/skills/kb/scripts/extractors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.claude/skills/kb/scripts/extractors/docx_extractor.py b/.claude/skills/kb/scripts/extractors/docx_extractor.py new file mode 100644 index 0000000..33648eb --- /dev/null +++ b/.claude/skills/kb/scripts/extractors/docx_extractor.py @@ -0,0 +1,28 @@ +"""Extract text from Word documents.""" +from docx import Document +import os + + +def extract(docx_path: str, output_dir: str) -> str: + """Extract all paragraphs and tables from docx.""" + basename = os.path.splitext(os.path.basename(docx_path))[0] + txt_path = os.path.join(output_dir, f"{basename}.txt") + + doc = Document(docx_path) + parts = [] + + for para in doc.paragraphs: + if para.text.strip(): + parts.append(para.text) + + for i, table in enumerate(doc.tables): + parts.append(f"\n--- Table {i+1} ---") + for row in table.rows: + cells = [cell.text.strip() for cell in row.cells] + parts.append(" | ".join(cells)) + + with open(txt_path, "w", encoding="utf-8") as f: + f.write("\n".join(parts)) + + print(f" Word: {len(doc.paragraphs)} paragraphs, {len(doc.tables)} tables extracted") + return txt_path diff --git a/.claude/skills/kb/scripts/extractors/excel_extractor.py b/.claude/skills/kb/scripts/extractors/excel_extractor.py new file mode 100644 index 0000000..1414d20 --- /dev/null +++ b/.claude/skills/kb/scripts/extractors/excel_extractor.py @@ -0,0 +1,34 @@ +"""Extract text summary from Excel files.""" +import pandas as pd +import os + + +def extract(excel_path: str, output_dir: str) -> str: + """Read all sheets, output text summary.""" + basename = os.path.splitext(os.path.basename(excel_path))[0] + txt_path = os.path.join(output_dir, f"{basename}.txt") + + ext = os.path.splitext(excel_path)[1].lower() + if ext == ".csv": + df = pd.read_csv(excel_path) + parts = [f"--- CSV ({len(df)} rows x {len(df.columns)} cols) ---"] + parts.append(f"Columns: {', '.join(df.columns.astype(str))}") + parts.append(df.head(50).to_string(index=False)) + if len(df) > 50: + parts.append(f"... ({len(df) - 50} more rows)") + else: + xls = pd.ExcelFile(excel_path) + parts = [] + for sheet in xls.sheet_names: + df = pd.read_excel(xls, sheet_name=sheet) + parts.append(f"--- Sheet: {sheet} ({len(df)} rows x {len(df.columns)} cols) ---") + parts.append(f"Columns: {', '.join(df.columns.astype(str))}") + parts.append(df.head(50).to_string(index=False)) + if len(df) > 50: + parts.append(f"... ({len(df) - 50} more rows)") + + with open(txt_path, "w", encoding="utf-8") as f: + f.write("\n\n".join(parts)) + + print(f" Excel: extracted to {basename}.txt") + return txt_path diff --git a/.claude/skills/kb/scripts/extractors/image_extractor.py b/.claude/skills/kb/scripts/extractors/image_extractor.py new file mode 100644 index 0000000..3eee591 --- /dev/null +++ b/.claude/skills/kb/scripts/extractors/image_extractor.py @@ -0,0 +1,20 @@ +"""OCR text from images using pytesseract.""" +import pytesseract +from PIL import Image +import os + + +def extract(image_path: str, output_dir: str) -> str: + """OCR image, return text file path.""" + basename = os.path.splitext(os.path.basename(image_path))[0] + txt_path = os.path.join(output_dir, f"{basename}.txt") + + img = Image.open(image_path) + text = pytesseract.image_to_string(img, lang="chi_sim+eng") + + with open(txt_path, "w", encoding="utf-8") as f: + f.write(text) + + chars = len(text.strip()) + print(f" Image OCR: {chars} characters extracted") + return txt_path diff --git a/.claude/skills/kb/scripts/extractors/pdf_extractor.py b/.claude/skills/kb/scripts/extractors/pdf_extractor.py new file mode 100644 index 0000000..be92a91 --- /dev/null +++ b/.claude/skills/kb/scripts/extractors/pdf_extractor.py @@ -0,0 +1,34 @@ +"""Extract text and images from PDF files using PyMuPDF.""" +import fitz # PyMuPDF +import os + + +def extract(pdf_path: str, output_dir: str) -> str: + """Extract text from PDF, save images, return text file path.""" + doc = fitz.open(pdf_path) + text_parts = [] + img_count = 0 + + for page_num, page in enumerate(doc): + text_parts.append(f"--- Page {page_num + 1} ---") + text_parts.append(page.get_text()) + + for img_idx, img in enumerate(page.get_images(full=True)): + xref = img[0] + pix = fitz.Pixmap(doc, xref) + if pix.n > 4: + pix = fitz.Pixmap(fitz.csRGB, pix) + img_path = os.path.join(output_dir, f"page{page_num+1}_img{img_idx+1}.png") + pix.save(img_path) + img_count += 1 + pix = None + + doc.close() + + basename = os.path.splitext(os.path.basename(pdf_path))[0] + txt_path = os.path.join(output_dir, f"{basename}.txt") + with open(txt_path, "w", encoding="utf-8") as f: + f.write("\n".join(text_parts)) + + print(f" PDF: {len(text_parts)//2} pages, {img_count} images extracted") + return txt_path diff --git a/.claude/skills/kb/scripts/ingest.py b/.claude/skills/kb/scripts/ingest.py new file mode 100644 index 0000000..a36284c --- /dev/null +++ b/.claude/skills/kb/scripts/ingest.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +"""Scan raw/ for new files, extract text, print summary for LLM to parse.""" +import importlib +import os +import sys + +# Add scripts dir to path so extractors can be imported +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +EXTRACTORS = { + ".pdf": "extractors.pdf_extractor", + ".xlsx": "extractors.excel_extractor", + ".xls": "extractors.excel_extractor", + ".csv": "extractors.excel_extractor", + ".png": "extractors.image_extractor", + ".jpg": "extractors.image_extractor", + ".jpeg": "extractors.image_extractor", + ".docx": "extractors.docx_extractor", +} +SKIP_EXT = {".md", ".txt"} +SKIP_DIRS = {".extracted"} + + +def scan_raw(raw_dir, registry_path): + """Find files in raw/ not yet in RAW-REGISTRY.md.""" + registered = set() + if os.path.exists(registry_path): + with open(registry_path, "r", encoding="utf-8") as f: + for line in f: + if line.startswith("| raw/") or line.startswith("| ./raw/"): + path = line.split("|")[1].strip() + registered.add(path) + + new_files = [] + for root, dirs, files in os.walk(raw_dir): + dirs[:] = [d for d in dirs if d not in SKIP_DIRS] + for fname in sorted(files): + fpath = os.path.join(root, fname) + rel = os.path.relpath(fpath, os.path.dirname(raw_dir)) + if rel not in registered: + new_files.append(fpath) + return new_files + + +def process_file(fpath): + """Extract text from a single file. Returns (txt_path, file_type) or (None, file_type).""" + ext = os.path.splitext(fpath)[1].lower() + extracted_dir = os.path.join(os.path.dirname(fpath), ".extracted") + os.makedirs(extracted_dir, exist_ok=True) + + if ext in SKIP_EXT: + return None, ext + + mod_name = EXTRACTORS.get(ext) + if not mod_name: + print(f" SKIP (unsupported): {os.path.basename(fpath)}") + return None, ext + + try: + extractor = importlib.import_module(mod_name) + txt_path = extractor.extract(fpath, extracted_dir) + return txt_path, ext + except ImportError as e: + print(f" ERROR (missing dependency): {e}") + return None, ext + except Exception as e: + print(f" ERROR: {e}") + return None, ext + + +def main(): + kb_root = sys.argv[1] if len(sys.argv) > 1 else os.getcwd() + raw_dir = os.path.join(kb_root, "raw") + registry = os.path.join(kb_root, "index", "RAW-REGISTRY.md") + + if not os.path.isdir(raw_dir): + print(f"ERROR: {raw_dir} not found") + sys.exit(1) + + new_files = scan_raw(raw_dir, registry) + if not new_files: + print("No new files in raw/") + return + + print(f"Found {len(new_files)} new file(s) in raw/:\n") + results = [] + for fpath in sorted(new_files): + print(f"Processing: {os.path.basename(fpath)}") + txt_path, ext = process_file(fpath) + results.append((fpath, txt_path, ext)) + + # Output summary for LLM to parse and update RAW-REGISTRY.md + print(f"\n--- INGEST SUMMARY ---") + print(f"Processed: {len(results)} files") + for fpath, txt_path, ext in results: + rel = os.path.relpath(fpath, kb_root) + status = "extracted" if txt_path else "ready" + print(f" {rel} [{ext}] -> {status}") + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/kb/scripts/requirements.txt b/.claude/skills/kb/scripts/requirements.txt new file mode 100644 index 0000000..7bbc86b --- /dev/null +++ b/.claude/skills/kb/scripts/requirements.txt @@ -0,0 +1,6 @@ +PyMuPDF>=1.24 +openpyxl>=3.1 +pandas>=2.0 +pytesseract>=0.3 +python-docx>=1.1 +Pillow>=10.0 diff --git a/.claude/skills/kb/templates/MASTER-INDEX.md b/.claude/skills/kb/templates/MASTER-INDEX.md new file mode 100644 index 0000000..3ab4e99 --- /dev/null +++ b/.claude/skills/kb/templates/MASTER-INDEX.md @@ -0,0 +1,4 @@ +# Master Index + +| 路径 | 类型 | æ‘˜è¦ | +|------|------|------| diff --git a/.claude/skills/kb/templates/ONTOLOGY.md b/.claude/skills/kb/templates/ONTOLOGY.md new file mode 100644 index 0000000..d3a255b --- /dev/null +++ b/.claude/skills/kb/templates/ONTOLOGY.md @@ -0,0 +1,50 @@ +# Ontology + +## 实体类型 + +| 类型 | 目录 | 命å规则 | 说明 | +|------|------|---------|------| +| concept | wiki/concepts/ | {slug}.md | 核心概念 | +| source | wiki/sources/ | {slug}.md | æ¥æºæ‘˜è¦ | +| comparison | wiki/comparisons/ | {a}-vs-{b}.md | å¯¹æ¯”åˆ†æž | + +## 关系 + +- 用 `[[åŒé“¾]]` 表达引用关系 +- frontmatter çš„ `compiled_from` è¡¨è¾¾æº¯æº +- frontmatter çš„ `related` è¡¨è¾¾å…³è” + +## Wiki æ–‡ç« æ¨¡æ¿ + +æ¯ç¯‡ wiki 文章使用以下结构: + +```yaml +--- +type: {entity_type} +id: {slug} +aliases: [] +compiled_from: + - raw/{source_file} +related: + - "[[other-article]]" +last_compiled: {date} +--- +``` + +### 正文结构 + +```markdown +# {标题} + +## 概述 +一段è¯å®šä¹‰... + +## è¦ç‚¹ +- ... + +## å…³è” +- [[相关概念]] + +## æ¥æº +- 编译自 raw/xxx.pdf +``` diff --git a/.claude/skills/kb/templates/RAW-REGISTRY.md b/.claude/skills/kb/templates/RAW-REGISTRY.md new file mode 100644 index 0000000..936b460 --- /dev/null +++ b/.claude/skills/kb/templates/RAW-REGISTRY.md @@ -0,0 +1,4 @@ +# Raw Registry + +| 文件 | 类型 | æ‘˜è¦ | çŠ¶æ€ | 编译产物 | +|------|------|------|------|---------| diff --git a/.claude/skills/kb/templates/TOPIC-MAP.md b/.claude/skills/kb/templates/TOPIC-MAP.md new file mode 100644 index 0000000..acafe1f --- /dev/null +++ b/.claude/skills/kb/templates/TOPIC-MAP.md @@ -0,0 +1,3 @@ +# Topic Map + + diff --git a/.claudian/sessions/conv-1776074446367-y9l6jom6z.meta.json b/.claudian/sessions/conv-1776074446367-y9l6jom6z.meta.json new file mode 100644 index 0000000..53967ce --- /dev/null +++ b/.claudian/sessions/conv-1776074446367-y9l6jom6z.meta.json @@ -0,0 +1,23 @@ +{ + "id": "conv-1776074446367-y9l6jom6z", + "providerId": "claude", + "title": "Start conversation", + "titleGenerationStatus": "success", + "createdAt": 1776074446367, + "updatedAt": 1776074479191, + "lastResponseAt": 1776074479191, + "sessionId": "bc8edeba-7a77-4523-8e79-95b84004035b", + "providerState": { + "providerSessionId": "bc8edeba-7a77-4523-8e79-95b84004035b" + }, + "usage": { + "model": "haiku", + "inputTokens": 163, + "cacheCreationInputTokens": 21877, + "cacheReadInputTokens": 0, + "contextWindow": 200000, + "contextTokens": 22040, + "percentage": 11, + "contextWindowIsAuthoritative": true + } +} \ No newline at end of file diff --git a/.obsidian/community-plugins.json b/.obsidian/community-plugins.json index d3f66fa..e5897c5 100644 --- a/.obsidian/community-plugins.json +++ b/.obsidian/community-plugins.json @@ -1,3 +1,4 @@ [ - "obsidian-git" + "obsidian-git", + "claudian" ] \ No newline at end of file diff --git a/.obsidian/plugins/claudian/data.json b/.obsidian/plugins/claudian/data.json new file mode 100644 index 0000000..fc829cb --- /dev/null +++ b/.obsidian/plugins/claudian/data.json @@ -0,0 +1,11 @@ +{ + "tabManagerState": { + "openTabs": [ + { + "tabId": "tab-1776074442549-1h1fbx3", + "conversationId": "conv-1776074446367-y9l6jom6z" + } + ], + "activeTabId": "tab-1776074442549-1h1fbx3" + } +} \ No newline at end of file diff --git a/测试.md b/Karpathy 让 AI 自己管知识库,告别 RAG 幻觉?我åšäº†ä¸€ä¸ªå·¥å…·ï¼ŒæŠŠè¿™å¥—方法è½åœ°äº†.md similarity index 100% rename from 测试.md rename to Karpathy 让 AI 自己管知识库,告别 RAG 幻觉?我åšäº†ä¸€ä¸ªå·¥å…·ï¼ŒæŠŠè¿™å¥—方法è½åœ°äº†.md diff --git a/ok.md b/ok.md deleted file mode 100644 index 8b13789..0000000 --- a/ok.md +++ /dev/null @@ -1 +0,0 @@ - diff --git a/未命å.canvas b/未命å.canvas deleted file mode 100644 index 9e26dfe..0000000 --- a/未命å.canvas +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file