Skip to content

Commit 708f298

Browse files
authored
feat: replace forSession() scoring with FTS5 BM25 (#48)
## Phase 3 of search improvements (depends on #47)

Replaces the coarse bag-of-words term-overlap scoring in `forSession()` with FTS5 BM25-based scoring.

### Problem

`forSession()` used manual term-overlap counting: extract the top 30 words >3 chars, then count how many appear in each entry via `string.includes()`. This ignored:

- Porter stemming ("configure" wouldn't match "configuration")
- TF-IDF weighting (all matching terms counted equally)
- Stopwords (common words inflated match counts)

### Solution

**New `scoreEntriesFTS()`** in ltm.ts:

- Runs session context terms against `knowledge_fts` using BM25
- Uses **OR** semantics (not AND-then-OR) because we're scoring all candidates for ranking, not searching for exact matches — an entry matching 1 of 40 terms should get a low score, not be excluded
- BM25 naturally weights entries matching more terms higher
- Scores normalized to 0–1 and multiplied by entry confidence

**Improved `extractTopTerms()`** moved to `search.ts`:

- Now uses the same STOPWORDS set from Phase 1
- Drops single chars only (not the >3-char threshold) — preserves "DB", "CI", "IO"
- Increased limit from 30 to 40 terms

### Safety net preserved

Top 5 project entries by confidence are always included regardless of FTS match, preventing the scoring change from accidentally excluding critical project knowledge.

### Test coverage

- 8 new tests for `extractTopTerms()` (stopwords, 2-char tokens, limits, punctuation)
- All 12 existing `forSession()` tests continue to pass
1 parent 332f7b2 commit 708f298

3 files changed

Lines changed: 159 additions & 45 deletions

File tree

src/ltm.ts

Lines changed: 64 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { uuidv7 } from "uuidv7";
22
import { db, ensureProject } from "./db";
3-
import { ftsQuery, ftsQueryOr, EMPTY_QUERY } from "./search";
3+
import { ftsQuery, ftsQueryOr, EMPTY_QUERY, extractTopTerms } from "./search";
44

55
// ~3 chars per token — validated as best heuristic against real API data.
66
function estimateTokens(text: string): number {
@@ -153,6 +153,9 @@ export function forProject(
153153

154154
type Scored = { entry: KnowledgeEntry; score: number };
155155

156+
/** BM25 column weights for knowledge_fts: title, content, category. */
157+
const FTS_WEIGHTS = { title: 6.0, content: 2.0, category: 3.0 };
158+
156159
/** Max entries per pool to include on first turn when no session context exists. */
157160
const NO_CONTEXT_FALLBACK_CAP = 10;
158161

@@ -163,43 +166,53 @@ const NO_CONTEXT_FALLBACK_CAP = 10;
163166
const PROJECT_SAFETY_NET = 5;
164167

165168
/**
166-
* Score entries by term overlap with session context.
167-
* Returns score = (fraction of topTerms matched) * entry.confidence.
169+
* Score entries by FTS5 BM25 relevance to session context.
170+
*
171+
* Uses OR semantics (not AND-then-OR) because we're scoring ALL candidates
172+
* for relevance ranking, not searching for exact matches. An entry that
173+
* matches 1 of 40 terms should still get a (low) score, not be excluded.
174+
* BM25 naturally weights entries matching more terms higher.
175+
*
176+
* Returns a Map of entry ID → normalized score (0–1).
168177
*/
169-
function scoreEntries(
170-
entries: KnowledgeEntry[],
171-
topTerms: string[],
172-
): Scored[] {
173-
return entries.map((entry) => {
174-
const haystack =
175-
(entry.title + " " + entry.content).replace(/[^\w\s]/g, " ").toLowerCase();
176-
let hits = 0;
177-
for (const term of topTerms) {
178-
if (haystack.includes(term)) hits++;
179-
}
180-
const relevance = topTerms.length > 0 ? hits / topTerms.length : 0;
181-
return { entry, score: relevance * entry.confidence };
182-
});
183-
}
178+
function scoreEntriesFTS(sessionContext: string): Map<string, number> {
179+
const terms = extractTopTerms(sessionContext);
180+
if (!terms.length) return new Map();
184181

185-
/**
186-
* Extract the top 30 meaningful terms (>3 chars) from text, sorted by frequency.
187-
*/
188-
function extractTopTerms(text: string): string[] {
189-
const freq = text
190-
.replace(/[^\w\s]/g, " ")
191-
.toLowerCase()
192-
.split(/\s+/)
193-
.filter((w) => w.length > 3)
194-
.reduce<Map<string, number>>((acc, w) => {
195-
acc.set(w, (acc.get(w) ?? 0) + 1);
196-
return acc;
197-
}, new Map());
198-
199-
return [...freq.entries()]
200-
.sort((a, b) => b[1] - a[1])
201-
.slice(0, 30)
202-
.map(([w]) => w);
182+
const q = terms.map((t) => `${t}*`).join(" OR ");
183+
const { title, content, category } = FTS_WEIGHTS;
184+
185+
try {
186+
const results = db()
187+
.query(
188+
`SELECT k.id, bm25(knowledge_fts, ?, ?, ?) as rank
189+
FROM knowledge k
190+
JOIN knowledge_fts f ON k.rowid = f.rowid
191+
WHERE knowledge_fts MATCH ?
192+
AND k.confidence > 0.2`,
193+
)
194+
.all(title, content, category, q) as Array<{
195+
id: string;
196+
rank: number;
197+
}>;
198+
199+
if (!results.length) return new Map();
200+
201+
// Normalize: BM25 rank is negative (more negative = better).
202+
// Convert to 0–1 where 1 = best match.
203+
const ranks = results.map((r) => r.rank);
204+
const minRank = Math.min(...ranks);
205+
const maxRank = Math.max(...ranks);
206+
const scoreMap = new Map<string, number>();
207+
for (const r of results) {
208+
const norm =
209+
minRank === maxRank ? 1 : (maxRank - r.rank) / (maxRank - minRank);
210+
scoreMap.set(r.id, norm);
211+
}
212+
return scoreMap;
213+
} catch {
214+
return new Map();
215+
}
203216
}
204217

205218
/**
@@ -279,10 +292,14 @@ export function forSession(
279292
let scoredCross: Scored[];
280293

281294
if (sessionContext.trim().length > 20) {
282-
const topTerms = extractTopTerms(sessionContext);
283-
284-
// Score project entries — include matched + safety net of top-N by confidence
285-
const rawScored = scoreEntries(projectEntries, topTerms);
295+
// Use FTS5 BM25 to score all knowledge entries against session context
296+
const ftsScores = scoreEntriesFTS(sessionContext);
297+
298+
// Score project entries: FTS relevance × confidence, with safety net
299+
const rawScored: Scored[] = projectEntries.map((entry) => ({
300+
entry,
301+
score: (ftsScores.get(entry.id) ?? 0) * entry.confidence,
302+
}));
286303
const matched = rawScored.filter((s) => s.score > 0);
287304
const matchedIds = new Set(matched.map((s) => s.entry.id));
288305

@@ -295,8 +312,13 @@ export function forSession(
295312

296313
scoredProject = [...matched, ...safetyNet];
297314

298-
// Score cross-project entries — only include entries with at least one term match
299-
scoredCross = scoreEntries(crossEntries, topTerms).filter((s) => s.score > 0);
315+
// Score cross-project entries — only include entries with FTS match
316+
scoredCross = crossEntries
317+
.filter((e) => ftsScores.has(e.id))
318+
.map((e) => ({
319+
entry: e,
320+
score: (ftsScores.get(e.id) ?? 0) * e.confidence,
321+
}));
300322
} else {
301323
// No session context — fall back to top entries by confidence, capped
302324
scoredProject = projectEntries
@@ -364,9 +386,6 @@ function searchLike(input: {
364386
.all(...likeParams, input.limit) as KnowledgeEntry[];
365387
}
366388

367-
/** BM25 column weights for knowledge_fts: title, content, category. */
368-
const FTS_WEIGHTS = { title: 6.0, content: 2.0, category: 3.0 };
369-
370389
export function search(input: {
371390
query: string;
372391
projectPath?: string;

src/search.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,38 @@ export function ftsQueryOr(raw: string): string {
173173
return terms.map((w) => `${w}*`).join(" OR ");
174174
}
175175

176+
// ---------------------------------------------------------------------------
177+
// Term extraction (Phase 3)
178+
// ---------------------------------------------------------------------------
179+
180+
/**
181+
* Extract the top meaningful terms from text, sorted by frequency.
182+
*
183+
* Same filtering as ftsQuery: drops single chars + stopwords.
184+
* No general length threshold — preserves short meaningful tokens like "DB", "CI".
185+
*
186+
* Used by forSession() to build session context queries for FTS5 scoring.
187+
*
188+
* @param text Raw text to extract terms from
189+
* @param limit Max number of terms to return (default 40)
190+
*/
191+
export function extractTopTerms(text: string, limit = 40): string[] {
192+
const freq = text
193+
.replace(/[^\w\s]/g, " ")
194+
.toLowerCase()
195+
.split(/\s+/)
196+
.filter((w) => w.length > 1 && !STOPWORDS.has(w))
197+
.reduce<Map<string, number>>((acc, w) => {
198+
acc.set(w, (acc.get(w) ?? 0) + 1);
199+
return acc;
200+
}, new Map());
201+
202+
return [...freq.entries()]
203+
.sort((a, b) => b[1] - a[1])
204+
.slice(0, limit)
205+
.map(([w]) => w);
206+
}
207+
176208
// ---------------------------------------------------------------------------
177209
// Score normalization & fusion (Phase 2)
178210
// ---------------------------------------------------------------------------

test/search.test.ts

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
EMPTY_QUERY,
77
normalizeRank,
88
reciprocalRankFusion,
9+
extractTopTerms,
910
} from "../src/search";
1011

1112
describe("search", () => {
@@ -258,4 +259,66 @@ describe("search", () => {
258259
expect(fused[0].score).toBeCloseTo(0.1, 4);
259260
});
260261
});
262+
263+
describe("extractTopTerms", () => {
264+
test("extracts terms sorted by frequency", () => {
265+
const terms = extractTopTerms("database database database config config");
266+
expect(terms[0]).toBe("database");
267+
expect(terms[1]).toBe("config");
268+
});
269+
270+
test("filters stopwords", () => {
271+
const terms = extractTopTerms("the database with the indexes from the table");
272+
expect(terms).toContain("database");
273+
expect(terms).toContain("indexes");
274+
expect(terms).toContain("table");
275+
expect(terms).not.toContain("the");
276+
expect(terms).not.toContain("with");
277+
expect(terms).not.toContain("from");
278+
});
279+
280+
test("filters single chars", () => {
281+
const terms = extractTopTerms("I found a bug in x module");
282+
expect(terms).toContain("found");
283+
expect(terms).toContain("bug");
284+
expect(terms).toContain("module");
285+
expect(terms).not.toContain("I");
286+
expect(terms).not.toContain("a");
287+
expect(terms).not.toContain("x");
288+
});
289+
290+
test("preserves 2-char tokens like DB, CI, IO", () => {
291+
const terms = extractTopTerms("check DB and CI pipeline for IO errors");
292+
expect(terms).toContain("db"); // lowercased
293+
expect(terms).toContain("ci");
294+
expect(terms).toContain("io");
295+
});
296+
297+
test("respects limit parameter", () => {
298+
const text = "alpha bravo charlie delta echo foxtrot golf hotel india juliet";
299+
const terms = extractTopTerms(text, 3);
300+
expect(terms.length).toBe(3);
301+
});
302+
303+
test("default limit is 40", () => {
304+
// Generate 50 unique words
305+
const words = Array.from({ length: 50 }, (_, i) => `word${i}`);
306+
const text = words.join(" ");
307+
const terms = extractTopTerms(text);
308+
expect(terms.length).toBe(40);
309+
});
310+
311+
test("returns empty for all-stopword text", () => {
312+
const terms = extractTopTerms("the with from is at by in");
313+
expect(terms.length).toBe(0);
314+
});
315+
316+
test("strips punctuation before processing", () => {
317+
const terms = extractTopTerms("what's happening? database-migration!");
318+
expect(terms).toContain("happening");
319+
expect(terms).toContain("database");
320+
expect(terms).toContain("migration");
321+
expect(terms).not.toContain("what"); // stopword
322+
});
323+
});
261324
});

0 commit comments

Comments (0)