Skip to content

Commit 022523a

Browse files
committed
feat: update pdf-brain tool for latest CLI
- Add URL support for the `add` command (PDFs and Markdown from URLs)
- Add Markdown file support (`.md`, `.markdown`)
- Add `--expand` param to `search` (context expansion up to 4000 chars)
- Add `repair` command (fix orphaned chunks/embeddings)
- Add `export`/`import` commands (backup/restore library)
- Add `migrate` command (`--check`, `--import`, `--generate-script`)
- Update `batch_add` to handle both PDFs and Markdown
- Use the `pdf-brain` CLI directly instead of the `bunx` wrapper
1 parent fa87a27 commit 022523a

2 files changed

Lines changed: 136 additions & 46 deletions

File tree

AGENTS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ The `opencode-swarm-plugin` provides type-safe, context-preserving wrappers. Alw
135135
- **pkg-scripts** - List package.json scripts
136136
- **repo-crawl\_\*** - GitHub API repo exploration
137137
- **repo-autopsy\_\*** - Clone & deep analyze repos locally
138-
- **pdf-brain\_\*** - PDF knowledge base
138+
- **pdf-brain\_\*** - PDF & Markdown knowledge base (supports URLs, `--expand` for context)
139139
- **ubs\_\*** - Multi-language bug scanner
140140

141141
### DEPRECATED - Do Not Use Directly

tool/pdf-brain.ts

Lines changed: 135 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import { tool } from "@opencode-ai/plugin";
22
import { existsSync } from "fs";
3-
import { join, basename } from "path";
3+
import { join, basename, extname } from "path";
44
import { spawn } from "child_process";
55

66
/**
7-
* PDF Brain - Local PDF knowledge base with vector search
7+
* PDF Brain - Local knowledge base with vector search
88
*
9+
* Supports PDFs and Markdown files (local paths or URLs).
910
* Uses PGlite + pgvector for semantic search via Ollama embeddings.
1011
* Stores in ~/Documents/.pdf-library/ for iCloud sync.
1112
*/
@@ -19,8 +20,7 @@ async function runCli(
1920
signal?: AbortSignal,
2021
): Promise<string> {
2122
return new Promise((resolve) => {
22-
// Use bunx for faster execution than npx (no registry check if cached)
23-
const proc = spawn("bunx", ["pdf-brain", ...args], {
23+
const proc = spawn("pdf-brain", args, {
2424
env: { ...process.env },
2525
stdio: ["ignore", "pipe", "pipe"],
2626
});
@@ -75,31 +75,48 @@ async function runCli(
7575
});
7676
}
7777

78+
function isUrl(str: string): boolean {
79+
return str.startsWith("http://") || str.startsWith("https://");
80+
}
81+
82+
function isValidFile(path: string): boolean {
83+
const ext = extname(path).toLowerCase();
84+
return ext === ".pdf" || ext === ".md" || ext === ".markdown";
85+
}
86+
7887
export const add = tool({
7988
description:
80-
"Add a PDF to the library - extracts text, generates embeddings for semantic search",
89+
"Add a PDF or Markdown file to the library - extracts text, generates embeddings for semantic search. Supports local paths and URLs.",
8190
args: {
82-
path: tool.schema.string().describe("Path to PDF file"),
91+
path: tool.schema.string().describe("Path to file (PDF/Markdown) or URL"),
8392
tags: tool.schema.string().optional().describe("Comma-separated tags"),
8493
title: tool.schema
8594
.string()
8695
.optional()
87-
.describe("Custom title (default: filename)"),
96+
.describe("Custom title (default: filename or frontmatter)"),
8897
},
89-
async execute({ path: pdfPath, tags, title }, ctx) {
90-
// Resolve path
91-
const resolvedPath = pdfPath.startsWith("~")
92-
? pdfPath.replace("~", process.env.HOME || "")
93-
: pdfPath.startsWith("/")
94-
? pdfPath
95-
: join(process.cwd(), pdfPath);
98+
async execute({ path: filePath, tags, title }, ctx) {
99+
// Handle URLs directly
100+
if (isUrl(filePath)) {
101+
const args = ["add", filePath];
102+
if (tags) args.push("--tags", tags);
103+
if (title) args.push("--title", title);
104+
return runCli(args, EMBEDDING_TIMEOUT_MS, ctx?.abort);
105+
}
106+
107+
// Resolve local path
108+
const resolvedPath = filePath.startsWith("~")
109+
? filePath.replace("~", process.env.HOME || "")
110+
: filePath.startsWith("/")
111+
? filePath
112+
: join(process.cwd(), filePath);
96113

97114
if (!existsSync(resolvedPath)) {
98115
return `File not found: ${resolvedPath}`;
99116
}
100117

101-
if (!resolvedPath.toLowerCase().endsWith(".pdf")) {
102-
return "Not a PDF file";
118+
if (!isValidFile(resolvedPath)) {
119+
return "Unsupported file type. Use PDF or Markdown files.";
103120
}
104121

105122
const args = ["add", resolvedPath];
@@ -113,7 +130,7 @@ export const add = tool({
113130

114131
export const search = tool({
115132
description:
116-
"Semantic search across all PDFs using vector similarity (requires Ollama)",
133+
"Semantic search across all documents using vector similarity (requires Ollama)",
117134
args: {
118135
query: tool.schema.string().describe("Natural language search query"),
119136
limit: tool.schema
@@ -124,31 +141,36 @@ export const search = tool({
124141
fts: tool.schema
125142
.boolean()
126143
.optional()
127-
.describe("Use full-text search only (no embeddings)"),
144+
.describe("Use full-text search only (skip embeddings)"),
145+
expand: tool.schema
146+
.number()
147+
.optional()
148+
.describe("Expand context around matches (max: 4000 chars)"),
128149
},
129-
async execute({ query, limit, tag, fts }, ctx) {
150+
async execute({ query, limit, tag, fts, expand }, ctx) {
130151
const args = ["search", query];
131152
if (limit) args.push("--limit", String(limit));
132153
if (tag) args.push("--tag", tag);
133154
if (fts) args.push("--fts");
155+
if (expand) args.push("--expand", String(Math.min(expand, 4000)));
134156

135157
// Vector search needs Ollama for query embedding (unless fts-only)
136158
return runCli(args, fts ? DEFAULT_TIMEOUT_MS : 60_000, ctx?.abort);
137159
},
138160
});
139161

140162
export const read = tool({
141-
description: "Get details about a specific PDF in the library",
163+
description: "Get document details and metadata",
142164
args: {
143-
query: tool.schema.string().describe("PDF ID or title"),
165+
query: tool.schema.string().describe("Document ID or title"),
144166
},
145167
async execute({ query }, ctx) {
146-
return runCli(["get", query], DEFAULT_TIMEOUT_MS, ctx?.abort);
168+
return runCli(["read", query], DEFAULT_TIMEOUT_MS, ctx?.abort);
147169
},
148170
});
149171

150172
export const list = tool({
151-
description: "List all PDFs in the library",
173+
description: "List all documents in the library",
152174
args: {
153175
tag: tool.schema.string().optional().describe("Filter by tag"),
154176
},
@@ -160,19 +182,19 @@ export const list = tool({
160182
});
161183

162184
export const remove = tool({
163-
description: "Remove a PDF from the library",
185+
description: "Remove a document from the library",
164186
args: {
165-
query: tool.schema.string().describe("PDF ID or title to remove"),
187+
query: tool.schema.string().describe("Document ID or title to remove"),
166188
},
167189
async execute({ query }, ctx) {
168190
return runCli(["remove", query], DEFAULT_TIMEOUT_MS, ctx?.abort);
169191
},
170192
});
171193

172194
export const tag = tool({
173-
description: "Set tags on a PDF",
195+
description: "Set tags on a document",
174196
args: {
175-
query: tool.schema.string().describe("PDF ID or title"),
197+
query: tool.schema.string().describe("Document ID or title"),
176198
tags: tool.schema.string().describe("Comma-separated tags to set"),
177199
},
178200
async execute({ query, tags }, ctx) {
@@ -196,10 +218,81 @@ export const check = tool({
196218
},
197219
});
198220

221+
export const repair = tool({
222+
description:
223+
"Fix database integrity issues - removes orphaned chunks/embeddings",
224+
args: {},
225+
async execute(_args, ctx) {
226+
return runCli(["repair"], DEFAULT_TIMEOUT_MS, ctx?.abort);
227+
},
228+
});
229+
230+
export const exportLib = tool({
231+
description: "Export library database for backup or sharing",
232+
args: {
233+
output: tool.schema
234+
.string()
235+
.optional()
236+
.describe("Output file path (default: ./pdf-brain-export.tar.gz)"),
237+
},
238+
async execute({ output }, ctx) {
239+
const args = ["export"];
240+
if (output) args.push("--output", output);
241+
return runCli(args, 60_000, ctx?.abort);
242+
},
243+
});
244+
245+
export const importLib = tool({
246+
description: "Import library database from export archive",
247+
args: {
248+
file: tool.schema.string().describe("Path to export archive"),
249+
force: tool.schema
250+
.boolean()
251+
.optional()
252+
.describe("Overwrite existing library"),
253+
},
254+
async execute({ file, force }, ctx) {
255+
const args = ["import", file];
256+
if (force) args.push("--force");
257+
return runCli(args, 60_000, ctx?.abort);
258+
},
259+
});
260+
261+
export const migrate = tool({
262+
description: "Database migration utilities",
263+
args: {
264+
check: tool.schema
265+
.boolean()
266+
.optional()
267+
.describe("Check if migration is needed"),
268+
importFile: tool.schema
269+
.string()
270+
.optional()
271+
.describe("Import from SQL dump file"),
272+
generateScript: tool.schema
273+
.boolean()
274+
.optional()
275+
.describe("Generate export script for current database"),
276+
},
277+
async execute({ check, importFile, generateScript }, ctx) {
278+
const args = ["migrate"];
279+
if (check) args.push("--check");
280+
if (importFile) args.push("--import", importFile);
281+
if (generateScript) args.push("--generate-script");
282+
283+
// If no flags were given, default to --check (report whether a migration is needed)
284+
if (!check && !importFile && !generateScript) {
285+
args.push("--check");
286+
}
287+
288+
return runCli(args, 60_000, ctx?.abort);
289+
},
290+
});
291+
199292
export const batch_add = tool({
200-
description: "Add multiple PDFs from a directory",
293+
description: "Add multiple PDFs/Markdown files from a directory",
201294
args: {
202-
dir: tool.schema.string().describe("Directory containing PDFs"),
295+
dir: tool.schema.string().describe("Directory containing documents"),
203296
tags: tool.schema.string().optional().describe("Tags to apply to all"),
204297
recursive: tool.schema
205298
.boolean()
@@ -217,43 +310,40 @@ export const batch_add = tool({
217310
return `Directory not found: ${resolvedDir}`;
218311
}
219312

220-
// Find PDFs
221-
const { readdirSync, statSync } = await import("fs");
313+
// Find documents
314+
const { readdirSync } = await import("fs");
222315

223-
function findPdfs(dir: string, recurse: boolean): string[] {
316+
function findDocs(dir: string, recurse: boolean): string[] {
224317
const results: string[] = [];
225318
for (const entry of readdirSync(dir, { withFileTypes: true })) {
226319
const fullPath = join(dir, entry.name);
227320
if (entry.isDirectory() && recurse) {
228-
results.push(...findPdfs(fullPath, true));
229-
} else if (
230-
entry.isFile() &&
231-
entry.name.toLowerCase().endsWith(".pdf")
232-
) {
321+
results.push(...findDocs(fullPath, true));
322+
} else if (entry.isFile() && isValidFile(entry.name)) {
233323
results.push(fullPath);
234324
}
235325
}
236326
return results;
237327
}
238328

239-
const pdfList = findPdfs(resolvedDir, recursive);
329+
const docList = findDocs(resolvedDir, recursive);
240330

241-
if (pdfList.length === 0) {
242-
return `No PDFs found in ${resolvedDir}`;
331+
if (docList.length === 0) {
332+
return `No PDF or Markdown files found in ${resolvedDir}`;
243333
}
244334

245335
const results: string[] = [];
246336

247-
for (const pdfPath of pdfList) {
337+
for (const docPath of docList) {
248338
// Check for abort between iterations
249339
if (ctx?.abort?.aborted) {
250-
results.push("\n\nOperation cancelled - remaining PDFs not processed");
340+
results.push("\n\nOperation cancelled - remaining files not processed");
251341
break;
252342
}
253343

254-
const title = basename(pdfPath, ".pdf");
344+
const title = basename(docPath, extname(docPath));
255345
try {
256-
const args = ["add", pdfPath];
346+
const args = ["add", docPath];
257347
if (tags) args.push("--tags", tags);
258348

259349
const result = await runCli(args, EMBEDDING_TIMEOUT_MS, ctx?.abort);
@@ -267,6 +357,6 @@ export const batch_add = tool({
267357
}
268358
}
269359

270-
return `# Batch Add Results (${pdfList.length} PDFs)\n\n${results.join("\n")}`;
360+
return `# Batch Add Results (${docList.length} documents)\n\n${results.join("\n")}`;
271361
},
272362
});

0 commit comments

Comments
 (0)