Skip to content
39 changes: 39 additions & 0 deletions hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3614,6 +3614,16 @@
in SCM and RECON.
</description>
</property>
<property>
<name>ozone.recon.scm.per.state.drift.threshold</name>
<value>5</value>
<tag>OZONE, RECON, SCM</tag>
<description>
Per-state lifecycle drift threshold used when SCM and Recon total container
counts are equal. If OPEN, QUASI_CLOSED, or derived CLOSED counts differ by
more than this value, Recon triggers a targeted SCM container sync.
</description>
</property>
<property>
<name>ozone.recon.scm.snapshot.enabled</name>
<value>true</value>
Expand Down Expand Up @@ -4600,6 +4610,35 @@
Interval in MINUTES by Recon to request SCM DB Snapshot.
</description>
</property>
<property>
<name>ozone.recon.scm.container.sync.task.initial.delay</name>
<value>2m</value>
<tag>OZONE, MANAGEMENT, RECON, SCM</tag>
<description>
Initial delay before Recon starts the incremental SCM container sync task.
This is slightly later than the SCM snapshot initial delay so the snapshot
can initialize Recon's SCM DB before the first incremental sync runs.
</description>
</property>
<property>
<name>ozone.recon.scm.container.sync.task.interval.delay</name>
<value>1h</value>
<tag>OZONE, MANAGEMENT, RECON, SCM</tag>
<description>
Interval between incremental SCM container sync runs in Recon. Each cycle
evaluates drift between SCM and Recon and either runs the targeted
multi-pass sync or takes no action.
</description>
</property>
<property>
<name>ozone.recon.scm.deleted.container.check.batch.size</name>
<value>500</value>
<tag>OZONE, RECON, SCM, PERFORMANCE</tag>
<description>
Maximum number of CLOSED or QUASI_CLOSED Recon containers checked against
SCM per incremental sync cycle for DELETING or DELETED retirement.
</description>
</property>
<property>
<name>ozone.om.snapshot.compaction.dag.max.time.allowed</name>
<value>30d</value>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,8 @@ private void initialize() throws IOException {
final ContainerInfo container = iterator.next();
Objects.requireNonNull(container, "container == null");
containers.addContainer(container);
if (container.getState() == LifeCycleState.OPEN) {
if (container.getState() == LifeCycleState.OPEN
&& container.getPipelineID() != null) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a specific reason for adding this check? Is it possible for a container to be reported as OPEN while its pipeline no longer exists (e.g., closed due to some error)? If so, this check could have an impact on that case.

try {
pipelineManager.addContainerToPipelineSCMStart(
container.getPipelineID(), container.containerID());
Expand All @@ -260,8 +261,12 @@ private void initialize() throws IOException {
getContainerStateChangeActions() {
final Map<LifeCycleEvent, CheckedConsumer<ContainerInfo, IOException>>
actions = new EnumMap<>(LifeCycleEvent.class);
actions.put(FINALIZE, info -> pipelineManager
.removeContainerFromPipeline(info.getPipelineID(), info.containerID()));
actions.put(FINALIZE, info -> {
if (info.getPipelineID() != null) {
pipelineManager.removeContainerFromPipeline(
info.getPipelineID(), info.containerID());
}
});
return actions;
}

Expand Down Expand Up @@ -334,12 +339,16 @@ public void addContainer(final ContainerInfoProto containerInfo)
transactionBuffer.addToBuffer(containerStore,
containerID, container);
containers.addContainer(container);
if (pipelineManager.containsPipeline(pipelineID)) {
if (pipelineID != null && pipelineManager.containsPipeline(pipelineID)) {
pipelineManager.addContainerToPipeline(pipelineID, containerID);
} else if (containerInfo.getState().
equals(LifeCycleState.OPEN)) {
// Pipeline should exist, but not
throw new PipelineNotFoundException();
if (pipelineID != null) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A null pipelineId also represents a PipelineNotFoundException case.

// OPEN containers normally require a live pipeline reference.
throw new PipelineNotFoundException();
}
LOG.warn("Adding OPEN container {} without pipeline tracking "
+ "because its pipeline ID is null.", containerID);
}
//recon may receive report of closed container,
// no corresponding Pipeline can be synced for scm.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -415,8 +415,21 @@ public List<ContainerWithPipeline> getExistContainerWithPipelinesInBatch(
ContainerWithPipeline cp = getContainerWithPipelineCommon(containerID);
cpList.add(cp);
} catch (IOException ex) {
//not found , just go ahead
LOG.error("Container with common pipeline not found: {}", ex);
// Pipeline lookup failed (e.g., QUASI_CLOSED container whose pipeline
// has already been cleaned up). Return the container metadata without a
// pipeline so that callers (e.g., Recon's sync) can still record the
// container rather than losing it silently.
LOG.warn("Pipeline lookup failed for container {}; returning container "
+ "without pipeline. Cause: {}", containerID, ex.getMessage());
try {
ContainerInfo info = scm.getContainerManager()
.getContainer(ContainerID.valueOf(containerID));
cpList.add(new ContainerWithPipeline(info, null));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we returning the pipeline as null here?

} catch (ContainerNotFoundException notFound) {
// Container truly does not exist in SCM — exclude it from the result.
LOG.error("Container {} not found in SCM and will not be returned "
+ "to caller.", containerID, notFound);
}
}
}
return cpList;
Expand Down
10 changes: 5 additions & 5 deletions hadoop-ozone/dist/src/main/compose/ozone/docker-config
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ CORE-SITE.XML_hadoop.proxyuser.hadoop.groups=*
OZONE-SITE.XML_ozone.om.address=om
OZONE-SITE.XML_ozone.om.http-address=om:9874
OZONE-SITE.XML_ozone.scm.http-address=scm:9876
OZONE-SITE.XML_ozone.scm.container.size=1GB
OZONE-SITE.XML_ozone.scm.block.size=1MB
OZONE-SITE.XML_ozone.scm.container.size=1MB
OZONE-SITE.XML_ozone.scm.block.size=100KB
OZONE-SITE.XML_ozone.scm.datanode.ratis.volume.free-space.min=10MB
OZONE-SITE.XML_ozone.scm.pipeline.creation.interval=30s
OZONE-SITE.XML_ozone.scm.pipeline.owner.container.count=1
Expand All @@ -42,7 +42,7 @@ OZONE-SITE.XML_ozone.recon.address=recon:9891
OZONE-SITE.XML_ozone.recon.http-address=0.0.0.0:9888
OZONE-SITE.XML_ozone.recon.https-address=0.0.0.0:9889
OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m
OZONE-SITE.XML_ozone.recon.om.snapshot.task.initial.delay=20s
OZONE-SITE.XML_ozone.recon.task.missingcontainer.interval=1m
OZONE-SITE.XML_ozone.datanode.pipeline.limit=1
OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE-SITE.XML_hdds.container.report.interval=60s
Expand All @@ -51,8 +51,8 @@ OZONE-SITE.XML_ozone.scm.dead.node.interval=45s
OZONE-SITE.XML_hdds.heartbeat.interval=5s
OZONE-SITE.XML_ozone.scm.close.container.wait.duration=5s
OZONE-SITE.XML_hdds.scm.replication.thread.interval=15s
OZONE-SITE.XML_hdds.scm.replication.under.replicated.interval=5s
OZONE-SITE.XML_hdds.scm.replication.over.replicated.interval=5s
OZONE-SITE.XML_hdds.scm.replication.under.replicated.interval=1m
OZONE-SITE.XML_hdds.scm.replication.over.replicated.interval=3m
OZONE-SITE.XML_hdds.scm.wait.time.after.safemode.exit=30s
OZONE-SITE.XML_ozone.http.basedir=/tmp/ozone_http

Expand Down
Loading