-
Notifications
You must be signed in to change notification settings - Fork 820
SOLR-18194: fix nested docs detection false positive #4279
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
0f6dccc
15f02bd
adb2f88
76cdb76
f4d0f2a
38703b7
36ec733
c20431c
ec91d99
7765e0f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| title: Fix nested docs detection false positive | ||
| type: changed | ||
| authors: | ||
| - name: Luke Kot-Zaniewski | ||
| links: | ||
| - name: SOLR-18194 | ||
| url: https://issues.apache.org/jira/browse/SOLR-18194 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -323,11 +323,11 @@ private void setMinVersionForSegments(SolrCore core, Set<String> segments, Versi | |
| private record SegmentLayout(String coreName, String seg1, String seg2, String seg3) {} | ||
|
|
||
| @Test | ||
| public void testUpgradeCoreIndexFailsWithNestedDocuments() throws Exception { | ||
| public void testUpgradeCoreIndexFailsWithChildDocuments() throws Exception { | ||
| final SolrCore core = h.getCore(); | ||
| final String coreName = core.getName(); | ||
|
|
||
| // Create a parent document with a child document (nested doc) | ||
| // Create a parent document with a child document | ||
| SolrInputDocument parentDoc = new SolrInputDocument(); | ||
| parentDoc.addField("id", "100"); | ||
| parentDoc.addField("title", "Parent Document"); | ||
|
|
@@ -338,7 +338,7 @@ public void testUpgradeCoreIndexFailsWithNestedDocuments() throws Exception { | |
|
|
||
| parentDoc.addChildDocument(childDoc); | ||
|
|
||
| // Index the nested document | ||
| // Index the parent+child document | ||
| try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, new ModifiableSolrParams())) { | ||
| AddUpdateCommand cmd = new AddUpdateCommand(req); | ||
| cmd.solrDoc = parentDoc; | ||
|
|
@@ -349,7 +349,7 @@ public void testUpgradeCoreIndexFailsWithNestedDocuments() throws Exception { | |
| // Verify documents were indexed (parent + child = 2 docs) | ||
| assertQ(req("q", "*:*"), "//result[@numFound='2']"); | ||
|
|
||
| // Attempt to upgrade the index - should fail because of nested documents | ||
| // Attempt to upgrade the index - should fail because of child documents | ||
| CoreAdminHandler admin = new CoreAdminHandler(h.getCoreContainer()); | ||
| try { | ||
| final SolrQueryResponse resp = new SolrQueryResponse(); | ||
|
|
@@ -365,10 +365,192 @@ public void testUpgradeCoreIndexFailsWithNestedDocuments() throws Exception { | |
| coreName), | ||
| resp)); | ||
|
|
||
| // Verify the exception message indicates nested documents are not supported | ||
| // Verify the exception message indicates child documents are not supported | ||
| assertThat( | ||
| thrown.getMessage(), | ||
| containsString("does not support indexes containing nested documents")); | ||
| containsString("does not support indexes containing child documents")); | ||
| } finally { | ||
| admin.shutdown(); | ||
| admin.close(); | ||
| } | ||
| } | ||
|
|
||
| // --- Child docs detection tests --- | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [0] Wow lots of tests... and they seem only slightly tweaked amongst each other. My observation is that Lucene/Solr old-timers (like me) prefer to write a minimal number of randomized tests instead of writing many repeated tests that tweak something. I noticed LLMs prefer the latter. But it's fine, really!
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These are indeed LLM-generated but I was very intentional about what scenarios to test, i.e. hasChildDocs X hasUpdates X hasDeletes. These variables have different effects on the distinct count vs […] As an aside, is your reflexive urge to use randomized testing here motivated by aesthetic reasons? If so, I could consider parameterizing them. I think it should be possible.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for explaining why the tests are what they are. Seems perfect then :-) No; not aesthetic reasons. Well okay slightly but whatever. I love the idea of a utility function somewhere! SolrCore? I hate to suggest that place given SolrCore's size but it would at least have some logical sense to its placement there, and discoverability, which matters. |
||
| // | ||
| // These tests verify that the child document detection in the upgrade path | ||
| // correctly distinguishes between genuine child docs and non-child docs, | ||
| // even in the presence of updates and deletes that leave deleted documents | ||
| // in segments (since NoMergePolicy prevents segment merges from purging them). | ||
|
|
||
| @Test | ||
| public void testChildDocsDetection_noChildDocsJustAdd() throws Exception { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I understand the intent that we are trying to be explicit about the edge cases of the child docs check. In certain cases however, I'd argue that certain tests like this one are effectively testing the happy path which is already getting covered inherently in existing tests like testNoUpgradeNeededWhenAllSegmentsCurrent(). I appreciate the thorough tests, but I think we have an opportunity to keep the coverage tight here by relying on certain existing tests like above. Another reason is that every test is "just another test" by itself, but can gradually add to the build times. |
||
| for (int i = 0; i < 10; i++) { | ||
| assertU(adoc("id", String.valueOf(i), "title", "doc" + i)); | ||
| } | ||
| assertU(commit("openSearcher", "true")); | ||
|
|
||
| assertUpgradeDoesNotDetectChildDocs(); | ||
| } | ||
|
|
||
| @Test | ||
| public void testChildDocsDetection_withChildDocsJustAdd() throws Exception { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same argument on duplicate coverage as above through existing test testUpgradeCoreIndexFailsWithNestedDocuments() (now renamed to testUpgradeCoreIndexFailsWithChildDocuments() in this PR) |
||
| addChildDoc("100", "101"); | ||
| addChildDoc("200", "201"); | ||
| assertU(commit("openSearcher", "true")); | ||
|
|
||
| assertUpgradeDetectsChildDocs(); | ||
| } | ||
|
|
||
| @Test | ||
| public void testChildDocsDetection_noChildDocsWithWithinCommitUpdates() throws Exception { | ||
| // Add docs and then update some of them BEFORE committing, so both the old | ||
| // (deleted) and new versions end up in the same flushed segment. | ||
| // With NoMergePolicy and a 100MB RAM buffer (from SolrIndexConfig defaults), | ||
| // no flush or merge occurs mid-batch, guaranteeing co-location. | ||
| // | ||
| // In the resulting segment, _root_ Terms stats will show: | ||
| // Terms.size() = N (unique _root_ values, one per unique id) | ||
| // Terms.getDocCount() = N + updates (includes deleted doc entries) | ||
| // | ||
| // A naive check (uniqueRootValues < docsWithRoot) may false-positive here | ||
| // because multiple docs share the same _root_ value within the segment. | ||
| for (int i = 0; i < 10; i++) { | ||
| assertU(adoc("id", String.valueOf(i), "title", "doc" + i)); | ||
| } | ||
| // Re-add a few docs with the same ids (within-commit updates) | ||
| for (int i = 0; i < 3; i++) { | ||
| assertU(adoc("id", String.valueOf(i), "title", "updated_doc" + i)); | ||
| } | ||
| assertU(commit("openSearcher", "true")); | ||
|
|
||
| // 10 live docs — the updates replaced 3 docs in-place | ||
| assertQ(req("q", "*:*"), "//result[@numFound='10']"); | ||
| assertUpgradeDoesNotDetectChildDocs(); | ||
| } | ||
|
|
||
| @Test | ||
| public void testChildDocsDetection_withChildDocsWithWithinCommitUpdates() throws Exception { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd argue that all tests of the pattern testChildDocsDetection_withChildDocs* are getting covered through the existing test testUpgradeCoreIndexFailsWithChildDocuments() since the with vs within(CommitUpdates) condition is not a real differentiator in that case. testChildDocsDetection_noChildDocsWithWithinCommitUpdates() and testChildDocsDetection_noChildDocsWithWithinCommitDeletesAndUpdates() is all we need?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In the current algorithm they are equivalent but things can change so this is more for regression prevention. I think it's possible to imagine some future code path that only gets exercised when […] That's why I proposed moving the child doc check to a central location (where they can also be exposed by some info endpoint as well). I know @dsmiley expressed interest in exposing the child doc check via some info endpoint in the original PR (luke or maybe in core/collection status?). If we were to go that direction then we'd probably want to move this check to some centralized utility.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Totally agree with the idea of a utility function for this check. SolrCore as @dsmiley suggested seems logical. Luke handler seems fine too since it gives core-level stats. If you don't mind, can you please elaborate on the case of "all deletes via update" that you mentioned? Trying to make sure I fully comprehend it.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Consider three statistics that are similar but subtly different for "id-like" fields:
However, when you have deletions where none of the deletions are part of updates then you have:
I want to add tests which protect against the wrong kind of refactor which IMO is not that hard considering the subtleties of the different types of counts. I know they confused me a bit until I considered all the cases mentioned above. |
||
| // Same within-commit pattern but with actual child docs present | ||
| addChildDoc("100", "101"); | ||
|
|
||
| // Add and immediately re-add some non-child docs | ||
| for (int i = 0; i < 5; i++) { | ||
| assertU(adoc("id", String.valueOf(i), "title", "doc" + i)); | ||
| } | ||
| for (int i = 0; i < 3; i++) { | ||
| assertU(adoc("id", String.valueOf(i), "title", "updated_doc" + i)); | ||
| } | ||
| assertU(commit("openSearcher", "true")); | ||
|
|
||
| assertUpgradeDetectsChildDocs(); | ||
| } | ||
|
|
||
| @Test | ||
| public void testChildDocsDetection_noChildDocsWithWithinCommitDeletesAndUpdates() | ||
| throws Exception { | ||
| // Add docs, delete some, and update others — all before committing. | ||
| // Deleted and updated docs leave behind deleted entries in the same segment, | ||
| // which can cause false positives in the child docs detection. | ||
| for (int i = 0; i < 10; i++) { | ||
| assertU(adoc("id", String.valueOf(i), "title", "doc" + i)); | ||
| } | ||
| // Delete a few | ||
| assertU(delI("3")); | ||
| assertU(delI("4")); | ||
| assertU(delI("5")); | ||
| // Update a few others | ||
| for (int i = 0; i < 3; i++) { | ||
| assertU(adoc("id", String.valueOf(i), "title", "updated_doc" + i)); | ||
| } | ||
| assertU(commit("openSearcher", "true")); | ||
|
|
||
| // 7 live docs: ids 0,1,2 (updated), 6,7,8,9 (untouched); 3,4,5 deleted | ||
| assertQ(req("q", "*:*"), "//result[@numFound='7']"); | ||
| assertUpgradeDoesNotDetectChildDocs(); | ||
| } | ||
|
|
||
| @Test | ||
| public void testChildDocsDetection_withChildDocsWithWithinCommitDeletesAndUpdates() | ||
| throws Exception { | ||
| addChildDoc("100", "101"); | ||
|
|
||
| for (int i = 0; i < 5; i++) { | ||
| assertU(adoc("id", String.valueOf(i), "title", "doc" + i)); | ||
| } | ||
| assertU(delI("3")); | ||
| assertU(delI("4")); | ||
| assertU(adoc("id", "0", "title", "updated_doc0")); | ||
| assertU(commit("openSearcher", "true")); | ||
|
|
||
| assertUpgradeDetectsChildDocs(); | ||
| } | ||
|
|
||
| /** Index a parent document with a single child via the update handler. */ | ||
| private void addChildDoc(String parentId, String childId) throws Exception { | ||
| SolrCore core = h.getCore(); | ||
| SolrInputDocument parentDoc = new SolrInputDocument(); | ||
| parentDoc.addField("id", parentId); | ||
| parentDoc.addField("title", "Parent " + parentId); | ||
|
|
||
| SolrInputDocument childDoc = new SolrInputDocument(); | ||
| childDoc.addField("id", childId); | ||
| childDoc.addField("title", "Child " + childId); | ||
| parentDoc.addChildDocument(childDoc); | ||
|
|
||
| try (SolrQueryRequestBase solrReq = | ||
| new SolrQueryRequestBase(core, new ModifiableSolrParams())) { | ||
| AddUpdateCommand cmd = new AddUpdateCommand(solrReq); | ||
| cmd.solrDoc = parentDoc; | ||
| core.getUpdateHandler().addDoc(cmd); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Assert that the upgrade endpoint does NOT throw the child-documents error. This verifies that | ||
| * {@code indexContainsChildDocs} returns false. | ||
| */ | ||
| private void assertUpgradeDoesNotDetectChildDocs() throws Exception { | ||
| final String coreName = h.getCore().getName(); | ||
| CoreAdminHandler admin = new CoreAdminHandler(h.getCoreContainer()); | ||
| try { | ||
| final SolrQueryResponse resp = new SolrQueryResponse(); | ||
| admin.handleRequestBody( | ||
| req( | ||
| CoreAdminParams.ACTION, | ||
| CoreAdminParams.CoreAdminAction.UPGRADECOREINDEX.toString(), | ||
| CoreAdminParams.CORE, | ||
| coreName), | ||
| resp); | ||
| assertNull("Unexpected exception: " + resp.getException(), resp.getException()); | ||
| } finally { | ||
| admin.shutdown(); | ||
| admin.close(); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Assert that the upgrade endpoint DOES throw the child-documents error. This verifies that | ||
| * {@code indexContainsChildDocs} returns true. | ||
| */ | ||
| private void assertUpgradeDetectsChildDocs() throws Exception { | ||
| final String coreName = h.getCore().getName(); | ||
| CoreAdminHandler admin = new CoreAdminHandler(h.getCoreContainer()); | ||
| try { | ||
| final SolrQueryResponse resp = new SolrQueryResponse(); | ||
| SolrException thrown = | ||
| assertThrows( | ||
| SolrException.class, | ||
| () -> | ||
| admin.handleRequestBody( | ||
| req( | ||
| CoreAdminParams.ACTION, | ||
| CoreAdminParams.CoreAdminAction.UPGRADECOREINDEX.toString(), | ||
| CoreAdminParams.CORE, | ||
| coreName), | ||
| resp)); | ||
| assertThat( | ||
| thrown.getMessage(), | ||
| containsString("does not support indexes containing child documents")); | ||
| } finally { | ||
| admin.shutdown(); | ||
| admin.close(); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.