Skip to content
Open
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
title: Fix nested docs detection false positive
type: changed
authors:
- name: Luke Kot-Zaniewski
links:
- name: SOLR-18194
url: https://issues.apache.org/jira/browse/SOLR-18194
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,11 @@ private UpgradeCoreIndexResponse performUpgradeImpl(

RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
try {
// Check for nested documents before processing - we don't support them
if (indexContainsNestedDocs(searcherRef.get())) {
// Check for child documents before processing - we don't support them
if (indexContainsChildDocs(searcherRef.get())) {
throw new SolrException(
BAD_REQUEST,
"UPGRADECOREINDEX does not support indexes containing nested documents. "
"UPGRADECOREINDEX does not support indexes containing child documents. "
Comment thread
kotman12 marked this conversation as resolved.
Outdated
+ " Consider reindexing your data "
+ "from the original source.");
}
Expand Down Expand Up @@ -259,26 +259,44 @@ private boolean shouldUpgradeSegment(LeafReaderContext lrc) {
return (segmentMinVersion == null || segmentMinVersion.major < Version.LATEST.major);
}

private boolean indexContainsNestedDocs(SolrIndexSearcher searcher) throws IOException {
private boolean indexContainsChildDocs(SolrIndexSearcher searcher) throws IOException {
IndexSchema schema = searcher.getSchema();

// First check if schema supports nested docs
// First check if schema supports child docs
if (!schema.isUsableForChildDocs()) {
return false;
}

// Check if _root_ field has fewer unique values than documents with that field.
// This indicates multiple docs share the same _root_ (i.e., child docs exist)
String uniqueKeyFieldName = schema.getUniqueKeyField().getName();

// Compare unique _root_ values against unique id values per segment.
// For non-child docs, every document's _root_ equals its own id, so the number of
// distinct _root_ values equals the number of distinct id values. For child docs,
// children share the parent's _root_ value, so there are fewer distinct _root_ values
// than distinct id values.
//
// We intentionally compare against unique id values rather than Terms.getDocCount()
// (the number of documents with the _root_ field) because segment-level term statistics
// include deleted documents. Updates (delete + re-add of the same id) can leave multiple
// documents with the same _root_ value within a segment, causing getDocCount() to exceed
// the unique _root_ count even when no child docs exist.
IndexReader reader = searcher.getIndexReader();
for (LeafReaderContext leaf : reader.leaves()) {
Terms terms = leaf.reader().terms(IndexSchema.ROOT_FIELD_NAME);
if (terms != null) {
long uniqueRootValues = terms.size();
int docsWithRoot = terms.getDocCount();

if (uniqueRootValues == -1 || uniqueRootValues < docsWithRoot) {
return true; // Codec doesn't store number of terms (so a safe fallback), or multiple docs
// share same _root_ (aka nested docs exist)
Terms rootTerms = leaf.reader().terms(IndexSchema.ROOT_FIELD_NAME);
if (rootTerms != null) {
long uniqueRootValues = rootTerms.size();
if (uniqueRootValues == -1) {
return true; // Codec doesn't report term count; assume child docs as a safe fallback
}

Terms idTerms = leaf.reader().terms(uniqueKeyFieldName);
long uniqueIdValues = (idTerms != null) ? idTerms.size() : -1;
if (uniqueIdValues == -1) {
return true; // Codec doesn't report term count; assume child docs as a safe fallback
}

if (uniqueRootValues < uniqueIdValues) {
return true; // Fewer distinct _root_ values than distinct ids means child docs exist
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -323,11 +323,11 @@ private void setMinVersionForSegments(SolrCore core, Set<String> segments, Versi
private record SegmentLayout(String coreName, String seg1, String seg2, String seg3) {}

@Test
public void testUpgradeCoreIndexFailsWithNestedDocuments() throws Exception {
public void testUpgradeCoreIndexFailsWithChildDocuments() throws Exception {
final SolrCore core = h.getCore();
final String coreName = core.getName();

// Create a parent document with a child document (nested doc)
// Create a parent document with a child document
SolrInputDocument parentDoc = new SolrInputDocument();
parentDoc.addField("id", "100");
parentDoc.addField("title", "Parent Document");
Expand All @@ -338,7 +338,7 @@ public void testUpgradeCoreIndexFailsWithNestedDocuments() throws Exception {

parentDoc.addChildDocument(childDoc);

// Index the nested document
// Index the parent+child document
try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, new ModifiableSolrParams())) {
AddUpdateCommand cmd = new AddUpdateCommand(req);
cmd.solrDoc = parentDoc;
Expand All @@ -349,7 +349,7 @@ public void testUpgradeCoreIndexFailsWithNestedDocuments() throws Exception {
// Verify documents were indexed (parent + child = 2 docs)
assertQ(req("q", "*:*"), "//result[@numFound='2']");

// Attempt to upgrade the index - should fail because of nested documents
// Attempt to upgrade the index - should fail because of child documents
CoreAdminHandler admin = new CoreAdminHandler(h.getCoreContainer());
try {
final SolrQueryResponse resp = new SolrQueryResponse();
Expand All @@ -365,10 +365,192 @@ public void testUpgradeCoreIndexFailsWithNestedDocuments() throws Exception {
coreName),
resp));

// Verify the exception message indicates nested documents are not supported
// Verify the exception message indicates child documents are not supported
assertThat(
thrown.getMessage(),
containsString("does not support indexes containing nested documents"));
containsString("does not support indexes containing child documents"));
} finally {
admin.shutdown();
admin.close();
}
}

// --- Child docs detection tests ---
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[0] Wow lots of tests... and they seem only slightly tweaked amongst each. My observation is that Lucene/Solr old-timers (like me) prefer to write a minimal number of randomized test instead of writing many repeated tests that tweak something. I noticed LLMs prefer the latter. But it's fine, really!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are indeed LLM generated but I was very intentional about what scenarios to test, i.e. hasChildDocs X hasUpdates X hasDeletes. These variables have different effects on the distinct count vs docCount of both id and _root_ and I would like any hypothetical future refactoring to take each into account, since the logic is a function of these variables. I am not convinced a randomized test would have been effective here without knowing what edge cases you are looking for a priori. I suppose in theory it could have caught other things I hadn't considered, although that seems unlikely given the relatively simple logic and few moving parts (famous last words). Thus, there is an advantage to deterministically validating these known edge cases with each build. That being said, it feels like overkill to test these in UpgradeCoreIndex as this check is a rather small component of the overall operation. Perhaps this check should live in some utility function? Then we could also call it from, say, the LukeHandler and expose a hasNested/ChildDocs in the index info. The benefit of such thorough testing would be more obvious in a targeted utility. What do you think?

As an aside, is your reflexive urge to use randomized testing here motivated by aesthetic reasons? If so, I could consider parameterizing them. I think it should be possible.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for explaining why the tests are what they are. Seems perfect then :-)

No; not aesthetic reasons. Well okay slightly but whatever.

I love the idea of a utility function somewhere! SolrCore? I hate to suggest that place given SolrCore's size but it would at least have some logical sense to its placement there, and discoverability, which matters.

//
// These tests verify that the child document detection in the upgrade path
// correctly distinguishes between genuine child docs and non-child docs,
// even in the presence of updates and deletes that leave deleted documents
// in segments (since NoMergePolicy prevents segment merges from purging them).

@Test
public void testChildDocsDetection_noChildDocsJustAdd() throws Exception {
Copy link
Copy Markdown
Member

@rahulgoswami rahulgoswami Apr 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand the intent that we are trying to be explicit about the edge cases of the child docs check. In certain cases however, I'd argue that certain tests like this one are effectively testing the happy path which is already getting covered inherently in existing tests like testNoUpgradeNeededWhenAllSegmentsCurrent().

I appreciate the thorough tests, but I think we have an opportunity to keep the coverage tight here by relying on certain existing tests like above. Another reasoning is that every test is "just another test" by itself, but can gradually add to the build times.

for (int i = 0; i < 10; i++) {
assertU(adoc("id", String.valueOf(i), "title", "doc" + i));
}
assertU(commit("openSearcher", "true"));

assertUpgradeDoesNotDetectChildDocs();
}

@Test
public void testChildDocsDetection_withChildDocsJustAdd() throws Exception {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same argument on duplicate coverage as above through existing test testUpgradeCoreIndexFailsWithNestedDocuments() (now renamed to testUpgradeCoreIndexFailsWithChildDocuments() in this PR)

addChildDoc("100", "101");
addChildDoc("200", "201");
assertU(commit("openSearcher", "true"));

assertUpgradeDetectsChildDocs();
}

@Test
public void testChildDocsDetection_noChildDocsWithWithinCommitUpdates() throws Exception {
  // Index documents, then re-index a subset of them BEFORE the commit so that both
  // the stale (deleted) versions and their replacements land in the same flushed
  // segment. NoMergePolicy plus the 100MB default RAM buffer (SolrIndexConfig)
  // means nothing flushes or merges mid-batch, guaranteeing co-location.
  //
  // For the resulting segment, the _root_ Terms stats look like:
  //   Terms.size()        = N (one unique _root_ value per unique id)
  //   Terms.getDocCount() = N + updates (deleted doc entries are still counted)
  //
  // A naive uniqueRootValues < docsWithRoot comparison could false-positive here,
  // since several docs share the same _root_ value even though no child docs exist.
  final int totalDocs = 10;
  final int updatedDocs = 3;
  for (int id = 0; id < totalDocs; id++) {
    assertU(adoc("id", String.valueOf(id), "title", "doc" + id));
  }
  // Within-commit updates: re-add a few ids before any commit happens
  for (int id = 0; id < updatedDocs; id++) {
    assertU(adoc("id", String.valueOf(id), "title", "updated_doc" + id));
  }
  assertU(commit("openSearcher", "true"));

  // Still 10 live docs — each update replaced its earlier version in-place
  assertQ(req("q", "*:*"), "//result[@numFound='10']");
  assertUpgradeDoesNotDetectChildDocs();
}

@Test
public void testChildDocsDetection_withChildDocsWithWithinCommitUpdates() throws Exception {
Copy link
Copy Markdown
Member

@rahulgoswami rahulgoswami Apr 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd argue that all tests of the pattern testChildDocsDetection_withChildDocs* are getting covered through the existing test testUpgradeCoreIndexFailsWithChildDocuments() since the with vs within(CommitUpdates) condition is not a real differentiator in that case.

testChildDocsDetection_noChildDocsWithWithinCommitUpdates() and testChildDocsDetection_noChildDocsWithWithinCommitDeletesAndUpdates() is all we need?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the current algorithm they are equivalent but things can change so this is more for regression prevention. I think it's possible to imagine some future code path that only gets exercised when unique(id) == idTerms.size() (which is the case only if all deletes are via update) or vice versa. We can also randomize the counts of docs added vs deleted vs updated if we are really worried about the incremental build time here. All that being said, it feels like overkill for the UpgradeCoreIndex tests because this isn't even the main point of this class.

That's why I proposed moving the child doc check to a central location (where they can also be exposed by some info endpoint as well). I know @dsmiley expressed interest in exposing the child doc check via some info endpoint in the original PR (luke or maybe in core/collection status?). If we were to go that direction then we'd probably want to move this check to some centralized utility.

Copy link
Copy Markdown
Member

@rahulgoswami rahulgoswami Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally agree with the idea of a utility function for this check. SolrCore as @dsmiley suggested seems logical. Luke handler seems fine too since it gives core level stats.

If you don't mind, can you please elaborate the case of "all deletes via update" that you mentioned? Trying to make sure I fully comprehend it.

Copy link
Copy Markdown
Contributor Author

@kotman12 kotman12 Apr 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider three statistics that are similar but subtly different for "id-like" fields: terms.size, terms.docCount, unique(id) (i.e. facet via searcher). When all your deletions are via updates, meaning every deletion of a solr Id is paired with an addition of that same solr Id then you have:

terms.docCount > terms.size = unique(id)

However, when you have deletions where none of the deletions are part of updates then you have:

terms.docCount = terms.size > unique(id)

I want to add tests which protect against the wrong kind of refactor, which IMO is not that hard to make considering the subtleties of the different types of counts. I know they confused me a bit until I considered all the cases mentioned above.

// Same within-commit pattern but with actual child docs present
addChildDoc("100", "101");

// Add and immediately re-add some non-child docs
for (int i = 0; i < 5; i++) {
assertU(adoc("id", String.valueOf(i), "title", "doc" + i));
}
for (int i = 0; i < 3; i++) {
assertU(adoc("id", String.valueOf(i), "title", "updated_doc" + i));
}
assertU(commit("openSearcher", "true"));

assertUpgradeDetectsChildDocs();
}

@Test
public void testChildDocsDetection_noChildDocsWithWithinCommitDeletesAndUpdates()
    throws Exception {
  // Index docs, then delete one subset and update another, all within a single
  // commit. Both the deletes and the updates leave deleted entries behind in the
  // same segment — exactly the situation that can trip a false positive in the
  // child docs detection.
  for (int id = 0; id < 10; id++) {
    assertU(adoc("id", String.valueOf(id), "title", "doc" + id));
  }
  // Within-commit deletes
  for (String deletedId : new String[] {"3", "4", "5"}) {
    assertU(delI(deletedId));
  }
  // Within-commit updates of a different subset
  for (int id = 0; id < 3; id++) {
    assertU(adoc("id", String.valueOf(id), "title", "updated_doc" + id));
  }
  assertU(commit("openSearcher", "true"));

  // 7 live docs remain: 0,1,2 were updated; 6,7,8,9 untouched; 3,4,5 deleted
  assertQ(req("q", "*:*"), "//result[@numFound='7']");
  assertUpgradeDoesNotDetectChildDocs();
}

@Test
public void testChildDocsDetection_withChildDocsWithWithinCommitDeletesAndUpdates()
    throws Exception {
  // One genuine parent/child pair plus within-commit deletes and an update:
  // detection must still report child docs despite the deleted entries skewing
  // the per-segment term statistics.
  addChildDoc("100", "101");

  for (int id = 0; id < 5; id++) {
    assertU(adoc("id", String.valueOf(id), "title", "doc" + id));
  }
  for (String deletedId : new String[] {"3", "4"}) {
    assertU(delI(deletedId));
  }
  assertU(adoc("id", "0", "title", "updated_doc0"));
  assertU(commit("openSearcher", "true"));

  assertUpgradeDetectsChildDocs();
}

/** Index a parent document with a single child via the update handler. */
private void addChildDoc(String parentId, String childId) throws Exception {
  // Build the child first, then attach it to its parent
  SolrInputDocument child = new SolrInputDocument();
  child.addField("id", childId);
  child.addField("title", "Child " + childId);

  SolrInputDocument parent = new SolrInputDocument();
  parent.addField("id", parentId);
  parent.addField("title", "Parent " + parentId);
  parent.addChildDocument(child);

  SolrCore core = h.getCore();
  try (SolrQueryRequestBase updateReq =
      new SolrQueryRequestBase(core, new ModifiableSolrParams())) {
    AddUpdateCommand addCmd = new AddUpdateCommand(updateReq);
    addCmd.solrDoc = parent;
    core.getUpdateHandler().addDoc(addCmd);
  }
}

/**
 * Assert that the upgrade endpoint does NOT throw the child-documents error. This verifies that
 * {@code indexContainsChildDocs} returns false.
 */
private void assertUpgradeDoesNotDetectChildDocs() throws Exception {
  String coreName = h.getCore().getName();
  CoreAdminHandler admin = new CoreAdminHandler(h.getCoreContainer());
  try {
    SolrQueryResponse resp = new SolrQueryResponse();
    // Fire the UPGRADECOREINDEX action; any child-docs rejection would surface
    // as a thrown SolrException or an exception recorded on the response.
    admin.handleRequestBody(
        req(
            CoreAdminParams.ACTION,
            CoreAdminParams.CoreAdminAction.UPGRADECOREINDEX.toString(),
            CoreAdminParams.CORE,
            coreName),
        resp);
    assertNull("Unexpected exception: " + resp.getException(), resp.getException());
  } finally {
    admin.shutdown();
    admin.close();
  }
}

/**
* Assert that the upgrade endpoint DOES throw the child-documents error. This verifies that
* {@code indexContainsChildDocs} returns true.
*/
private void assertUpgradeDetectsChildDocs() throws Exception {
final String coreName = h.getCore().getName();
CoreAdminHandler admin = new CoreAdminHandler(h.getCoreContainer());
try {
final SolrQueryResponse resp = new SolrQueryResponse();
SolrException thrown =
assertThrows(
SolrException.class,
() ->
admin.handleRequestBody(
req(
CoreAdminParams.ACTION,
CoreAdminParams.CoreAdminAction.UPGRADECOREINDEX.toString(),
CoreAdminParams.CORE,
coreName),
resp));
assertThat(
thrown.getMessage(),
containsString("does not support indexes containing child documents"));
} finally {
admin.shutdown();
admin.close();
Expand Down
Loading