Skip to content

Commit 708f298

Browse files
authored
feat: replace forSession() scoring with FTS5 BM25 (#48)
## Phase 3 of search improvements (depends on #47)

Replaces the coarse bag-of-words term-overlap scoring in `forSession()` with FTS5 BM25-based scoring.

### Problem

`forSession()` used manual term-overlap counting: extract the top 30 words >3 chars, then count how many appear in each entry via `string.includes()`. This ignored:

- Porter stemming ("configure" wouldn't match "configuration")
- TF-IDF weighting (all matching terms counted equally)
- Stopwords (common words inflated match counts)

### Solution

**New `scoreEntriesFTS()`** in ltm.ts:

- Runs session context terms against `knowledge_fts` using BM25
- Uses **OR** semantics (not AND-then-OR) because we're scoring all candidates for ranking, not searching for exact matches — an entry matching 1 of 40 terms should get a low score, not be excluded
- BM25 naturally weights entries matching more terms higher
- Scores normalized to 0–1 and multiplied by entry confidence

**Improved `extractTopTerms()`** moved to `search.ts`:

- Now uses the same STOPWORDS set from Phase 1
- Drops single chars only (not the >3-char threshold) — preserves "DB", "CI", "IO"
- Increased limit from 30 to 40 terms

### Safety net preserved

Top 5 project entries by confidence are always included regardless of FTS match, preventing the scoring change from accidentally excluding critical project knowledge.

### Test coverage

- 8 new tests for `extractTopTerms()` (stopwords, 2-char tokens, limits, punctuation)
- All 12 existing `forSession()` tests continue to pass
1 parent 332f7b2 commit 708f298

3 files changed

Lines changed: 159 additions & 45 deletions

File tree

src/ltm.ts

Lines changed: 64 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { uuidv7 } from "uuidv7";
22
import { db, ensureProject } from "./db";
3-
import { ftsQuery, ftsQueryOr, EMPTY_QUERY } from "./search";
3+
import { ftsQuery, ftsQueryOr, EMPTY_QUERY, extractTopTerms } from "./search";
44

55
// ~3 chars per token — validated as best heuristic against real API data.
66
function estimateTokens(text: string): number {
@@ -153,6 +153,9 @@ export function forProject(
153153

154154
type Scored = { entry: KnowledgeEntry; score: number };
155155

156+
/** BM25 column weights for knowledge_fts: title, content, category. */
157+
const FTS_WEIGHTS = { title: 6.0, content: 2.0, category: 3.0 };
158+
156159
/** Max entries per pool to include on first turn when no session context exists. */
157160
const NO_CONTEXT_FALLBACK_CAP = 10;
158161

@@ -163,43 +166,53 @@ const NO_CONTEXT_FALLBACK_CAP = 10;
163166
const PROJECT_SAFETY_NET = 5;
164167

165168
/**
166-
* Score entries by term overlap with session context.
167-
* Returns score = (fraction of topTerms matched) * entry.confidence.
169+
* Score entries by FTS5 BM25 relevance to session context.
170+
*
171+
* Uses OR semantics (not AND-then-OR) because we're scoring ALL candidates
172+
* for relevance ranking, not searching for exact matches. An entry that
173+
* matches 1 of 40 terms should still get a (low) score, not be excluded.
174+
* BM25 naturally weights entries matching more terms higher.
175+
*
176+
* Returns a Map of entry ID → normalized score (0–1).
168177
*/
169-
function scoreEntries(
170-
entries: KnowledgeEntry[],
171-
topTerms: string[],
172-
): Scored[] {
173-
return entries.map((entry) => {
174-
const haystack =
175-
(entry.title + " " + entry.content).replace(/[^\w\s]/g, " ").toLowerCase();
176-
let hits = 0;
177-
for (const term of topTerms) {
178-
if (haystack.includes(term)) hits++;
179-
}
180-
const relevance = topTerms.length > 0 ? hits / topTerms.length : 0;
181-
return { entry, score: relevance * entry.confidence };
182-
});
183-
}
178+
function scoreEntriesFTS(sessionContext: string): Map<string, number> {
179+
const terms = extractTopTerms(sessionContext);
180+
if (!terms.length) return new Map();
184181

185-
/**
186-
* Extract the top 30 meaningful terms (>3 chars) from text, sorted by frequency.
187-
*/
188-
function extractTopTerms(text: string): string[] {
189-
const freq = text
190-
.replace(/[^\w\s]/g, " ")
191-
.toLowerCase()
192-
.split(/\s+/)
193-
.filter((w) => w.length > 3)
194-
.reduce<Map<string, number>>((acc, w) => {
195-
acc.set(w, (acc.get(w) ?? 0) + 1);
196-
return acc;
197-
}, new Map());
198-
199-
return [...freq.entries()]
200-
.sort((a, b) => b[1] - a[1])
201-
.slice(0, 30)
202-
.map(([w]) => w);
182+
const q = terms.map((t) => `${t}*`).join(" OR ");
183+
const { title, content, category } = FTS_WEIGHTS;
184+
185+
try {
186+
const results = db()
187+
.query(
188+
`SELECT k.id, bm25(knowledge_fts, ?, ?, ?) as rank
189+
FROM knowledge k
190+
JOIN knowledge_fts f ON k.rowid = f.rowid
191+
WHERE knowledge_fts MATCH ?
192+
AND k.confidence > 0.2`,
193+
)
194+
.all(title, content, category, q) as Array<{
195+
id: string;
196+
rank: number;
197+
}>;
198+
199+
if (!results.length) return new Map();
200+
201+
// Normalize: BM25 rank is negative (more negative = better).
202+
// Convert to 0–1 where 1 = best match.
203+
const ranks = results.map((r) => r.rank);
204+
const minRank = Math.min(...ranks);
205+
const maxRank = Math.max(...ranks);
206+
const scoreMap = new Map<string, number>();
207+
for (const r of results) {
208+
const norm =
209+
minRank === maxRank ? 1 : (maxRank - r.rank) / (maxRank - minRank);
210+
scoreMap.set(r.id, norm);
211+
}
212+
return scoreMap;
213+
} catch {
214+
return new Map();
215+
}
203216
}
204217

205218
/**
@@ -279,10 +292,14 @@ export function forSession(
279292
let scoredCross: Scored[];
280293

281294
if (sessionContext.trim().length > 20) {
282-
const topTerms = extractTopTerms(sessionContext);
283-
284-
// Score project entries — include matched + safety net of top-N by confidence
285-
const rawScored = scoreEntries(projectEntries, topTerms);
295+
// Use FTS5 BM25 to score all knowledge entries against session context
296+
const ftsScores = scoreEntriesFTS(sessionContext);
297+
298+
// Score project entries: FTS relevance × confidence, with safety net
299+
const rawScored: Scored[] = projectEntries.map((entry) => ({
300+
entry,
301+
score: (ftsScores.get(entry.id) ?? 0) * entry.confidence,
302+
}));
286303
const matched = rawScored.filter((s) => s.score > 0);
287304
const matchedIds = new Set(matched.map((s) => s.entry.id));
288305

@@ -295,8 +312,13 @@ export function forSession(
295312

296313
scoredProject = [...matched, ...safetyNet];
297314

298-
// Score cross-project entries — only include entries with at least one term match
299-
scoredCross = scoreEntries(crossEntries, topTerms).filter((s) => s.score > 0);
315+
// Score cross-project entries — only include entries with FTS match
316+
scoredCross = crossEntries
317+
.filter((e) => ftsScores.has(e.id))
318+
.map((e) => ({
319+
entry: e,
320+
score: (ftsScores.get(e.id) ?? 0) * e.confidence,
321+
}));
300322
} else {
301323
// No session context — fall back to top entries by confidence, capped
302324
scoredProject = projectEntries
@@ -364,9 +386,6 @@ function searchLike(input: {
364386
.all(...likeParams, input.limit) as KnowledgeEntry[];
365387
}
366388

367-
/** BM25 column weights for knowledge_fts: title, content, category. */
368-
const FTS_WEIGHTS = { title: 6.0, content: 2.0, category: 3.0 };
369-
370389
export function search(input: {
371390
query: string;
372391
projectPath?: string;

src/search.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,38 @@ export function ftsQueryOr(raw: string): string {
173173
return terms.map((w) => `${w}*`).join(" OR ");
174174
}
175175

176+
// ---------------------------------------------------------------------------
177+
// Term extraction (Phase 3)
178+
// ---------------------------------------------------------------------------
179+
180+
/**
181+
* Extract the top meaningful terms from text, sorted by frequency.
182+
*
183+
* Same filtering as ftsQuery: drops single chars + stopwords.
184+
* No general length threshold — preserves short meaningful tokens like "DB", "CI".
185+
*
186+
* Used by forSession() to build session context queries for FTS5 scoring.
187+
*
188+
* @param text Raw text to extract terms from
189+
* @param limit Max number of terms to return (default 40)
190+
*/
191+
export function extractTopTerms(text: string, limit = 40): string[] {
192+
const freq = text
193+
.replace(/[^\w\s]/g, " ")
194+
.toLowerCase()
195+
.split(/\s+/)
196+
.filter((w) => w.length > 1 && !STOPWORDS.has(w))
197+
.reduce<Map<string, number>>((acc, w) => {
198+
acc.set(w, (acc.get(w) ?? 0) + 1);
199+
return acc;
200+
}, new Map());
201+
202+
return [...freq.entries()]
203+
.sort((a, b) => b[1] - a[1])
204+
.slice(0, limit)
205+
.map(([w]) => w);
206+
}
207+
176208
// ---------------------------------------------------------------------------
177209
// Score normalization & fusion (Phase 2)
178210
// ---------------------------------------------------------------------------

test/search.test.ts

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
EMPTY_QUERY,
77
normalizeRank,
88
reciprocalRankFusion,
9+
extractTopTerms,
910
} from "../src/search";
1011

1112
describe("search", () => {
@@ -258,4 +259,66 @@ describe("search", () => {
258259
expect(fused[0].score).toBeCloseTo(0.1, 4);
259260
});
260261
});
262+
263+
describe("extractTopTerms", () => {
264+
test("extracts terms sorted by frequency", () => {
265+
const terms = extractTopTerms("database database database config config");
266+
expect(terms[0]).toBe("database");
267+
expect(terms[1]).toBe("config");
268+
});
269+
270+
test("filters stopwords", () => {
271+
const terms = extractTopTerms("the database with the indexes from the table");
272+
expect(terms).toContain("database");
273+
expect(terms).toContain("indexes");
274+
expect(terms).toContain("table");
275+
expect(terms).not.toContain("the");
276+
expect(terms).not.toContain("with");
277+
expect(terms).not.toContain("from");
278+
});
279+
280+
test("filters single chars", () => {
281+
const terms = extractTopTerms("I found a bug in x module");
282+
expect(terms).toContain("found");
283+
expect(terms).toContain("bug");
284+
expect(terms).toContain("module");
285+
expect(terms).not.toContain("I");
286+
expect(terms).not.toContain("a");
287+
expect(terms).not.toContain("x");
288+
});
289+
290+
test("preserves 2-char tokens like DB, CI, IO", () => {
291+
const terms = extractTopTerms("check DB and CI pipeline for IO errors");
292+
expect(terms).toContain("db"); // lowercased
293+
expect(terms).toContain("ci");
294+
expect(terms).toContain("io");
295+
});
296+
297+
test("respects limit parameter", () => {
298+
const text = "alpha bravo charlie delta echo foxtrot golf hotel india juliet";
299+
const terms = extractTopTerms(text, 3);
300+
expect(terms.length).toBe(3);
301+
});
302+
303+
test("default limit is 40", () => {
304+
// Generate 50 unique words
305+
const words = Array.from({ length: 50 }, (_, i) => `word${i}`);
306+
const text = words.join(" ");
307+
const terms = extractTopTerms(text);
308+
expect(terms.length).toBe(40);
309+
});
310+
311+
test("returns empty for all-stopword text", () => {
312+
const terms = extractTopTerms("the with from is at by in");
313+
expect(terms.length).toBe(0);
314+
});
315+
316+
test("strips punctuation before processing", () => {
317+
const terms = extractTopTerms("what's happening? database-migration!");
318+
expect(terms).toContain("happening");
319+
expect(terms).toContain("database");
320+
expect(terms).toContain("migration");
321+
expect(terms).not.toContain("what"); // stopword
322+
});
323+
});
261324
});

0 commit comments

Comments (0)