diff --git a/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java b/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java
index 468f6217ecd9499c3a94c70b52527688ba2b14d5..b0263d3b5a94b6ddd81ce41c6bd21a8637ca6ebc 100644
--- a/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java
+++ b/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerNode.java
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -60,534 +60,534 @@ import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableSet;
@Unstable
public abstract class SchedulerNode {
- private static final Logger LOG =
- LoggerFactory.getLogger(SchedulerNode.class);
-
- private Resource unallocatedResource = Resource.newInstance(0, 0);
- private Resource allocatedResource = Resource.newInstance(0, 0);
- private Resource totalResource;
- private RMContainer reservedContainer;
- private volatile int numContainers;
- private volatile ResourceUtilization containersUtilization =
- ResourceUtilization.newInstance(0, 0, 0f);
- private volatile ResourceUtilization nodeUtilization =
- ResourceUtilization.newInstance(0, 0, 0f);
- /** Time stamp for overcommitted resources to time out. */
- private long overcommitTimeout = -1;
-
- /* Containers allocated on this node, keyed by container ID. */
- private final Map<ContainerId, ContainerInfo> launchedContainers =
- new HashMap<>();
-
- private final RMNode rmNode;
- private final String nodeName;
- private final RMContext rmContext;
-
- private volatile Set<String> labels = null;
-
- private volatile Set<NodeAttribute> nodeAttributes = null;
-
- // Last updated time
- private volatile long lastHeartbeatMonotonicTime;
-
- public SchedulerNode(RMNode node, boolean usePortForNodeName,
- Set<String> labels) {
- this.rmNode = node;
- this.rmContext = node.getRMContext();
- this.unallocatedResource = Resources.clone(node.getTotalCapability());
- this.totalResource = Resources.clone(node.getTotalCapability());
- if (usePortForNodeName) {
- nodeName = rmNode.getHostName() + ":" + node.getNodeID().getPort();
- } else {
- nodeName = rmNode.getHostName();
- }
- this.labels = ImmutableSet.copyOf(labels);
- this.lastHeartbeatMonotonicTime = Time.monotonicNow();
- }
-
- public SchedulerNode(RMNode node, boolean usePortForNodeName) {
- this(node, usePortForNodeName, CommonNodeLabelsManager.EMPTY_STRING_SET);
- }
-
- public RMNode getRMNode() {
- return this.rmNode;
- }
-
- /**
- * Set total resources on the node.
- * @param resource Total resources on the node.
- */
- public synchronized void updateTotalResource(Resource resource){
- this.totalResource = resource;
- this.unallocatedResource = Resources.subtract(totalResource,
- this.allocatedResource);
- }
-
- /**
- * Set the timeout for the node to stop overcommitting the resources. After
- * this time the scheduler will start killing containers until the resources
- * are not overcommitted anymore. This may reset a previous timeout.
- * @param timeOut Time out in milliseconds.
- */
- public synchronized void setOvercommitTimeOut(long timeOut) {
- if (timeOut >= 0) {
- if (this.overcommitTimeout != -1) {
- LOG.debug("The overcommit timeout for {} was already set to {}",
- getNodeID(), this.overcommitTimeout);
- }
- this.overcommitTimeout = Time.now() + timeOut;
- }
- }
-
- /**
- * Check if the time out has passed.
- * @return If the node is overcommitted.
- */
- public synchronized boolean isOvercommitTimedOut() {
- return this.overcommitTimeout >= 0 && Time.now() >= this.overcommitTimeout;
- }
-
- /**
- * Check if the node has a timeout set for overcommitted resources.
- * @return If the node has a timeout for overcommitted resources.
- */
- public synchronized boolean isOvercommitTimeOutSet() {
- return this.overcommitTimeout >= 0;
- }
-
- /**
- * Get the ID of the node which contains both its hostname and port.
- * @return The ID of the node.
- */
- public NodeId getNodeID() {
- return this.rmNode.getNodeID();
- }
-
- /**
- * Get HTTP address for the node.
- * @return HTTP address for the node.
- */
- public String getHttpAddress() {
- return this.rmNode.getHttpAddress();
- }
-
- /**
- * Get the name of the node for scheduling matching decisions.
- *
- * Typically this is the 'hostname' reported by the node, but it could be
- * configured to be 'hostname:port' reported by the node via the
- * {@link YarnConfiguration#RM_SCHEDULER_INCLUDE_PORT_IN_NODE_NAME} constant.
- * The main use case for this is the YARN minicluster, which needs to
- * differentiate node manager instances by their port number.
- * @return Name of the node for scheduling matching decisions.
- */
- public String getNodeName() {
- return nodeName;
- }
-
- /**
- * Get rackname.
- * @return rackname
- */
- public String getRackName() {
- return this.rmNode.getRackName();
- }
-
- /**
- * The Scheduler has allocated containers on this node to the given
- * application.
- * @param rmContainer Allocated container
- */
- public void allocateContainer(RMContainer rmContainer) {
- allocateContainer(rmContainer, false);
- }
-
- /**
- * The Scheduler has allocated containers on this node to the given
- * application.
- * @param rmContainer Allocated container
- * @param launchedOnNode True if the container has been launched
- */
- protected synchronized void allocateContainer(RMContainer rmContainer,
- boolean launchedOnNode) {
- Container container = rmContainer.getContainer();
- if (rmContainer.getExecutionType() == ExecutionType.GUARANTEED) {
- deductUnallocatedResource(container.getResource());
- ++numContainers;
- }
-
- launchedContainers.put(container.getId(),
- new ContainerInfo(rmContainer, launchedOnNode));
- }
-
- /**
- * Get unallocated resources on the node.
- * @return Unallocated resources on the node
- */
- public synchronized Resource getUnallocatedResource() {
- return this.unallocatedResource;
- }
-
- /**
- * Get allocated resources on the node.
- * @return Allocated resources on the node
- */
- public synchronized Resource getAllocatedResource() {
- return this.allocatedResource;
- }
-
- /**
- * Get total resources on the node.
- * @return Total resources on the node.
- */
- public synchronized Resource getTotalResource() {
- return this.totalResource;
- }
-
- /**
- * Check if a container is launched by this node.
- * @return If the container is launched by the node.
- */
- public synchronized boolean isValidContainer(ContainerId containerId) {
- if (launchedContainers.containsKey(containerId)) {
- return true;
- }
- return false;
- }
-
- /**
- * Update the resources of the node when releasing a container.
- * @param container Container to release.
- */
- protected synchronized void updateResourceForReleasedContainer(
- Container container) {
- if (container.getExecutionType() == ExecutionType.GUARANTEED) {
- addUnallocatedResource(container.getResource());
- --numContainers;
- }
- }
-
- /**
- * Release an allocated container on this node.
- * @param containerId ID of container to be released.
- * @param releasedByNode whether the release originates from a node update.
- */
- public synchronized void releaseContainer(ContainerId containerId,
- boolean releasedByNode) {
- ContainerInfo info = launchedContainers.get(containerId);
- if (info == null) {
- return;
- }
- if (!releasedByNode && info.launchedOnNode) {
- // wait until node reports container has completed
- return;
- }
-
- launchedContainers.remove(containerId);
- Container container = info.container.getContainer();
-
- // We remove allocation tags when a container is actually
- // released on NM. This is to avoid running into situation
- // when AM releases a container and NM has some delay to
- // actually release it, then the tag can still be visible
- // at RM so that RM can respect it during scheduling new containers.
- if (rmContext != null && rmContext.getAllocationTagsManager() != null) {
- rmContext.getAllocationTagsManager()
- .removeContainer(container.getNodeId(),
- container.getId(), container.getAllocationTags());
- }
-
- updateResourceForReleasedContainer(container);
-
- if (LOG.isDebugEnabled()) {
- LOG.debug("Released container " + container.getId() + " of capacity "
- + container.getResource() + " on host " + rmNode.getNodeAddress()
- + ", which currently has " + numContainers + " containers, "
- + getAllocatedResource() + " used and " + getUnallocatedResource()
- + " available" + ", release resources=" + true);
- }
- }
-
- /**
- * Inform the node that a container has launched.
- * @param containerId ID of the launched container
- */
- public synchronized void containerStarted(ContainerId containerId) {
- ContainerInfo info = launchedContainers.get(containerId);
- if (info != null) {
- info.launchedOnNode = true;
- }
- }
-
- /**
- * Add unallocated resources to the node. This is used when unallocating a
- * container.
- * @param resource Resources to add.
- */
- private synchronized void addUnallocatedResource(Resource resource) {
- if (resource == null) {
- LOG.error("Invalid resource addition of null resource for "
- + rmNode.getNodeAddress());
- return;
- }
- Resources.addTo(unallocatedResource, resource);
- Resources.subtractFrom(allocatedResource, resource);
- }
-
- /**
- * Deduct unallocated resources from the node. This is used when allocating a
- * container.
- * @param resource Resources to deduct.
- */
- @VisibleForTesting
- public synchronized void deductUnallocatedResource(Resource resource) {
- if (resource == null) {
- LOG.error("Invalid deduction of null resource for "
- + rmNode.getNodeAddress());
- return;
- }
- Resources.subtractFrom(unallocatedResource, resource);
- Resources.addTo(allocatedResource, resource);
- }
-
- /**
- * Reserve container for the attempt on this node.
- * @param attempt Application attempt asking for the reservation.
- * @param schedulerKey Priority of the reservation.
- * @param container Container reserving resources for.
- */
- public abstract void reserveResource(SchedulerApplicationAttempt attempt,
- SchedulerRequestKey schedulerKey, RMContainer container);
-
- /**
- * Unreserve resources on this node.
- * @param attempt Application attempt that had done the reservation.
- */
- public abstract void unreserveResource(SchedulerApplicationAttempt attempt);
-
- @Override
- public String toString() {
- return "host: " + rmNode.getNodeAddress() + " #containers="
- + getNumContainers() + " available=" + getUnallocatedResource()
- + " used=" + getAllocatedResource();
- }
-
- /**
- * Get number of active containers on the node.
- * @return Number of active containers on the node.
- */
- public int getNumContainers() {
- return numContainers;
- }
-
- /**
- * Get the containers running on the node.
- * @return A copy of containers running on the node.
- */
- public synchronized List<RMContainer> getCopiedListOfRunningContainers() {
- List<RMContainer> result = new ArrayList<>(launchedContainers.size());
- for (ContainerInfo info : launchedContainers.values()) {
- result.add(info.container);
- }
- return result;
- }
-
- /**
- * Get the containers running on the node with AM containers at the end.
- * @return A copy of running containers with AM containers at the end.
- */
- public synchronized List<RMContainer> getRunningContainersWithAMsAtTheEnd() {
- LinkedList<RMContainer> result = new LinkedList<>();
- for (ContainerInfo info : launchedContainers.values()) {
- if(info.container.isAMContainer()) {
- result.addLast(info.container);
- } else {
- result.addFirst(info.container);
- }
- }
- return result;
- }
-
- /**
- * Get the containers running on the node ordered by which to kill first. It
- * tries to kill AMs last, then GUARANTEED containers, and it kills
- * OPPORTUNISTIC first. Ties are broken by creation time.
- * @return A copy of the running containers ordered by which to kill first.
- */
- public List<RMContainer> getContainersToKill() {
- List<RMContainer> result = getLaunchedContainers();
- Collections.sort(result, (c1, c2) -> {
- return new CompareToBuilder()
- .append(c1.isAMContainer(), c2.isAMContainer())
- .append(c2.getExecutionType(), c1.getExecutionType()) // reversed
- .append(c2.getCreationTime(), c1.getCreationTime()) // reversed
- .toComparison();
- });
- return result;
- }
-
- /**
- * Get the launched containers in the node.
- * @return List of launched containers.
- */
- protected synchronized List<RMContainer> getLaunchedContainers() {
- List<RMContainer> result = new ArrayList<>();
- for (ContainerInfo info : launchedContainers.values()) {
- result.add(info.container);
- }
- return result;
- }
-
- /**
- * Get the container for the specified container ID.
- * @param containerId The container ID
- * @return The container for the specified container ID
- */
- protected synchronized RMContainer getContainer(ContainerId containerId) {
- RMContainer container = null;
- ContainerInfo info = launchedContainers.get(containerId);
- if (info != null) {
- container = info.container;
- }
- return container;
- }
-
- /**
- * Get the reserved container in the node.
- * @return Reserved container in the node.
- */
- public synchronized RMContainer getReservedContainer() {
- return reservedContainer;
- }
-
- /**
- * Set the reserved container in the node.
- * @param reservedContainer Reserved container in the node.
- */
- public synchronized void
- setReservedContainer(RMContainer reservedContainer) {
- this.reservedContainer = reservedContainer;
- }
-
- /**
- * Recover a container.
- * @param rmContainer Container to recover.
- */
- public synchronized void recoverContainer(RMContainer rmContainer) {
- if (rmContainer.getState().equals(RMContainerState.COMPLETED)) {
- return;
- }
- allocateContainer(rmContainer, true);
- }
-
- /**
- * Get the labels for the node.
- * @return Set of labels for the node.
- */
- public Set<String> getLabels() {
- return labels;
- }
-
- /**
- * Update the labels for the node.
- * @param labels Set of labels for the node.
- */
- public void updateLabels(Set<String> labels) {
- this.labels = labels;
- }
-
- /**
- * Get the partition the node belongs to. If the node labels of this node
- * are empty or null, it belongs to the NO_LABEL partition. Since we only
- * support one partition per node (YARN-2694), the first label is its
- * partition.
- * @return Partition for the node.
- */
- public String getPartition() {
- if (this.labels == null || this.labels.isEmpty()) {
- return RMNodeLabelsManager.NO_LABEL;
- } else {
- return this.labels.iterator().next();
- }
- }
-
- /**
- * Set the resource utilization of the containers in the node.
- * @param containersUtilization Resource utilization of the containers.
- */
- public void setAggregatedContainersUtilization(
- ResourceUtilization containersUtilization) {
- this.containersUtilization = containersUtilization;
- }
-
- /**
- * Get the resource utilization of the containers in the node.
- * @return Resource utilization of the containers.
- */
- public ResourceUtilization getAggregatedContainersUtilization() {
- return this.containersUtilization;
- }
-
- /**
- * Set the resource utilization of the node. This includes the containers.
- * @param nodeUtilization Resource utilization of the node.
- */
- public void setNodeUtilization(ResourceUtilization nodeUtilization) {
- this.nodeUtilization = nodeUtilization;
- }
-
- /**
- * Get the resource utilization of the node.
- * @return Resource utilization of the node.
- */
- public ResourceUtilization getNodeUtilization() {
- return this.nodeUtilization;
- }
-
- public long getLastHeartbeatMonotonicTime() {
- return lastHeartbeatMonotonicTime;
- }
-
- /**
- * This will be called for each node heartbeat.
- */
- public void notifyNodeUpdate() {
- this.lastHeartbeatMonotonicTime = Time.monotonicNow();
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (!(o instanceof SchedulerNode)) {
- return false;
- }
-
- SchedulerNode that = (SchedulerNode) o;
-
- return getNodeID().equals(that.getNodeID());
- }
-
- @Override
- public int hashCode() {
- return getNodeID().hashCode();
- }
-
- public Set<NodeAttribute> getNodeAttributes() {
- return nodeAttributes;
- }
-
- public void updateNodeAttributes(Set<NodeAttribute> attributes) {
- this.nodeAttributes = attributes;
- }
-
- private static class ContainerInfo {
- private final RMContainer container;
- private boolean launchedOnNode;
-
- public ContainerInfo(RMContainer container, boolean launchedOnNode) {
- this.container = container;
- this.launchedOnNode = launchedOnNode;
- }
- }
+ private static final Logger LOG =
+ LoggerFactory.getLogger(SchedulerNode.class);
+
+ private Resource unallocatedResource = Resource.newInstance(0, 0);
+ private Resource allocatedResource = Resource.newInstance(0, 0);
+ private Resource totalResource;
+ private RMContainer reservedContainer;
+ private volatile int numContainers;
+ private volatile ResourceUtilization containersUtilization =
+ ResourceUtilization.newInstance(0, 0, 0f);
+ private volatile ResourceUtilization nodeUtilization =
+ ResourceUtilization.newInstance(0, 0, 0f);
+ /** Time stamp for overcommitted resources to time out. */
+ private long overcommitTimeout = -1;
+
+ /* Containers allocated on this node, keyed by container ID. */
+ private final Map<ContainerId, ContainerInfo> launchedContainers =
+ new HashMap<>();
+
+ private final RMNode rmNode;
+ private final String nodeName;
+ private final RMContext rmContext;
+
+ private volatile Set<String> labels = null;
+
+ private volatile Set<NodeAttribute> nodeAttributes = null;
+
+ // Last updated time
+ private volatile long lastHeartbeatMonotonicTime;
+
+ public SchedulerNode(RMNode node, boolean usePortForNodeName,
+ Set<String> labels) {
+ this.rmNode = node;
+ this.rmContext = node.getRMContext();
+ this.unallocatedResource = Resources.clone(node.getTotalCapability());
+ this.totalResource = Resources.clone(node.getTotalCapability());
+ if (usePortForNodeName) {
+ nodeName = rmNode.getHostName() + ":" + node.getNodeID().getPort();
+ } else {
+ nodeName = rmNode.getHostName();
+ }
+ this.labels = ImmutableSet.copyOf(labels);
+ this.lastHeartbeatMonotonicTime = Time.monotonicNow();
+ }
+
+ public SchedulerNode(RMNode node, boolean usePortForNodeName) {
+ this(node, usePortForNodeName, CommonNodeLabelsManager.EMPTY_STRING_SET);
+ }
+
+ public RMNode getRMNode() {
+ return this.rmNode;
+ }
+
+ /**
+ * Set total resources on the node.
+ * @param resource Total resources on the node.
+ */
+ public synchronized void updateTotalResource(Resource resource) {
+ this.totalResource = resource;
+ this.unallocatedResource = Resources.subtract(totalResource,
+ this.allocatedResource);
+ }
+
+ /**
+ * Set the timeout for the node to stop overcommitting the resources. After
+ * this time the scheduler will start killing containers until the resources
+ * are not overcommitted anymore. This may reset a previous timeout.
+ * @param timeOut Time out in milliseconds.
+ */
+ public synchronized void setOvercommitTimeOut(long timeOut) {
+ if (timeOut >= 0) {
+ if (this.overcommitTimeout != -1) {
+ LOG.debug("The overcommit timeout for {} was already set to {}",
+ getNodeID(), this.overcommitTimeout);
+ }
+ this.overcommitTimeout = Time.now() + timeOut;
+ }
+ }
+
+ /**
+ * Check if the time out has passed.
+ * @return If the node is overcommitted.
+ */
+ public synchronized boolean isOvercommitTimedOut() {
+ return this.overcommitTimeout >= 0 && Time.now() >= this.overcommitTimeout;
+ }
+
+ /**
+ * Check if the node has a timeout set for overcommitted resources.
+ * @return If the node has a timeout for overcommitted resources.
+ */
+ public synchronized boolean isOvercommitTimeOutSet() {
+ return this.overcommitTimeout >= 0;
+ }
+
+ /**
+ * Get the ID of the node which contains both its hostname and port.
+ * @return The ID of the node.
+ */
+ public NodeId getNodeID() {
+ return this.rmNode.getNodeID();
+ }
+
+ /**
+ * Get HTTP address for the node.
+ * @return HTTP address for the node.
+ */
+ public String getHttpAddress() {
+ return this.rmNode.getHttpAddress();
+ }
+
+ /**
+ * Get the name of the node for scheduling matching decisions.
+ *
+ * Typically this is the 'hostname' reported by the node, but it could be
+ * configured to be 'hostname:port' reported by the node via the
+ * {@link YarnConfiguration#RM_SCHEDULER_INCLUDE_PORT_IN_NODE_NAME} constant.
+ * The main use case for this is the YARN minicluster, which needs to
+ * differentiate node manager instances by their port number.
+ * @return Name of the node for scheduling matching decisions.
+ */
+ public String getNodeName() {
+ return nodeName;
+ }
+
+ /**
+ * Get rackname.
+ * @return rackname
+ */
+ public String getRackName() {
+ return this.rmNode.getRackName();
+ }
+
+ /**
+ * The Scheduler has allocated containers on this node to the given
+ * application.
+ * @param rmContainer Allocated container
+ */
+ public void allocateContainer(RMContainer rmContainer) {
+ allocateContainer(rmContainer, false);
+ }
+
+ /**
+ * The Scheduler has allocated containers on this node to the given
+ * application.
+ * @param rmContainer Allocated container
+ * @param launchedOnNode True if the container has been launched
+ */
+ protected synchronized void allocateContainer(RMContainer rmContainer,
+ boolean launchedOnNode) {
+ Container container = rmContainer.getContainer();
+ if (rmContainer.getExecutionType() == ExecutionType.GUARANTEED) {
+ deductUnallocatedResource(container.getResource());
+ ++numContainers;
+ }
+
+ launchedContainers.put(container.getId(),
+ new ContainerInfo(rmContainer, launchedOnNode));
+ }
+
+ /**
+ * Get unallocated resources on the node.
+ * @return Unallocated resources on the node
+ */
+ public synchronized Resource getUnallocatedResource() {
+ return this.unallocatedResource;
+ }
+
+ /**
+ * Get allocated resources on the node.
+ * @return Allocated resources on the node
+ */
+ public synchronized Resource getAllocatedResource() {
+ return this.allocatedResource;
+ }
+
+ /**
+ * Get total resources on the node.
+ * @return Total resources on the node.
+ */
+ public synchronized Resource getTotalResource() {
+ return this.totalResource;
+ }
+
+ /**
+ * Check if a container is launched by this node.
+ * @return If the container is launched by the node.
+ */
+ public synchronized boolean isValidContainer(ContainerId containerId) {
+ if (launchedContainers.containsKey(containerId)) {
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Update the resources of the node when releasing a container.
+ * @param container Container to release.
+ */
+ protected synchronized void updateResourceForReleasedContainer(
+ Container container) {
+ if (container.getExecutionType() == ExecutionType.GUARANTEED) {
+ addUnallocatedResource(container.getResource());
+ --numContainers;
+ }
+ }
+
+ /**
+ * Release an allocated container on this node.
+ * @param containerId ID of container to be released.
+ * @param releasedByNode whether the release originates from a node update.
+ */
+ public synchronized void releaseContainer(ContainerId containerId,
+ boolean releasedByNode) {
+ ContainerInfo info = launchedContainers.get(containerId);
+ if (info == null) {
+ return;
+ }
+ if (!releasedByNode && info.launchedOnNode) {
+ // wait until node reports container has completed
+ return;
+ }
+
+ launchedContainers.remove(containerId);
+ Container container = info.container.getContainer();
+
+ // We remove allocation tags when a container is actually
+ // released on NM. This is to avoid running into situation
+ // when AM releases a container and NM has some delay to
+ // actually release it, then the tag can still be visible
+ // at RM so that RM can respect it during scheduling new containers.
+ if (rmContext != null && rmContext.getAllocationTagsManager() != null) {
+ rmContext.getAllocationTagsManager()
+ .removeContainer(container.getNodeId(),
+ container.getId(), container.getAllocationTags());
+ }
+
+ updateResourceForReleasedContainer(container);
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Released container " + container.getId() + " of capacity "
+ + container.getResource() + " on host " + rmNode.getNodeAddress()
+ + ", which currently has " + numContainers + " containers, "
+ + getAllocatedResource() + " used and " + getUnallocatedResource()
+ + " available" + ", release resources=" + true);
+ }
+ }
+
+ /**
+ * Inform the node that a container has launched.
+ * @param containerId ID of the launched container
+ */
+ public synchronized void containerStarted(ContainerId containerId) {
+ ContainerInfo info = launchedContainers.get(containerId);
+ if (info != null) {
+ info.launchedOnNode = true;
+ }
+ }
+
+ /**
+ * Add unallocated resources to the node. This is used when unallocating a
+ * container.
+ * @param resource Resources to add.
+ */
+ private synchronized void addUnallocatedResource(Resource resource) {
+ if (resource == null) {
+ LOG.error("Invalid resource addition of null resource for "
+ + rmNode.getNodeAddress());
+ return;
+ }
+ Resources.addTo(unallocatedResource, resource);
+ Resources.subtractFrom(allocatedResource, resource);
+ }
+
+ /**
+ * Deduct unallocated resources from the node. This is used when allocating a
+ * container.
+ * @param resource Resources to deduct.
+ */
+ @VisibleForTesting
+ public synchronized void deductUnallocatedResource(Resource resource) {
+ if (resource == null) {
+ LOG.error("Invalid deduction of null resource for "
+ + rmNode.getNodeAddress());
+ return;
+ }
+ Resources.subtractFrom(unallocatedResource, resource);
+ Resources.addTo(allocatedResource, resource);
+ }
+
+ /**
+ * Reserve container for the attempt on this node.
+ * @param attempt Application attempt asking for the reservation.
+ * @param schedulerKey Priority of the reservation.
+ * @param container Container reserving resources for.
+ */
+ public abstract void reserveResource(SchedulerApplicationAttempt attempt,
+ SchedulerRequestKey schedulerKey, RMContainer container);
+
+ /**
+ * Unreserve resources on this node.
+ * @param attempt Application attempt that had done the reservation.
+ */
+ public abstract void unreserveResource(SchedulerApplicationAttempt attempt);
+
+ @Override
+ public String toString() {
+ return "host: " + rmNode.getNodeAddress() + " #containers="
+ + getNumContainers() + " available=" + getUnallocatedResource()
+ + " used=" + getAllocatedResource();
+ }
+
+ /**
+ * Get number of active containers on the node.
+ * @return Number of active containers on the node.
+ */
+ public int getNumContainers() {
+ return numContainers;
+ }
+
+ /**
+ * Get the containers running on the node.
+ * @return A copy of containers running on the node.
+ */
+ public synchronized List<RMContainer> getCopiedListOfRunningContainers() {
+ List<RMContainer> result = new ArrayList<>(launchedContainers.size());
+ for (ContainerInfo info : launchedContainers.values()) {
+ result.add(info.container);
+ }
+ return result;
+ }
+
+ /**
+ * Get the containers running on the node with AM containers at the end.
+ * @return A copy of running containers with AM containers at the end.
+ */
+ public synchronized List<RMContainer> getRunningContainersWithAMsAtTheEnd() {
+ LinkedList<RMContainer> result = new LinkedList<>();
+ for (ContainerInfo info : launchedContainers.values()) {
+ if (info.container.isAMContainer()) {
+ result.addLast(info.container);
+ } else {
+ result.addFirst(info.container);
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Get the containers running on the node ordered by which to kill first. It
+ * tries to kill AMs last, then GUARANTEED containers, and it kills
+ * OPPORTUNISTIC first. Ties are broken by creation time.
+ * @return A copy of the running containers ordered by which to kill first.
+ */
+ public List<RMContainer> getContainersToKill() {
+ List<RMContainer> result = getLaunchedContainers();
+ Collections.sort(result, (c1, c2) -> {
+ return new CompareToBuilder()
+ .append(c1.isAMContainer(), c2.isAMContainer())
+ .append(c2.getExecutionType(), c1.getExecutionType()) // reversed
+ .append(c2.getCreationTime(), c1.getCreationTime()) // reversed
+ .toComparison();
+ });
+ return result;
+ }
+
+ /**
+ * Get the launched containers in the node.
+ * @return List of launched containers.
+ */
+ protected synchronized List<RMContainer> getLaunchedContainers() {
+ List<RMContainer> result = new ArrayList<>();
+ for (ContainerInfo info : launchedContainers.values()) {
+ result.add(info.container);
+ }
+ return result;
+ }
+
+ /**
+ * Get the container for the specified container ID.
+ * @param containerId The container ID
+ * @return The container for the specified container ID
+ */
+ protected synchronized RMContainer getContainer(ContainerId containerId) {
+ RMContainer container = null;
+ ContainerInfo info = launchedContainers.get(containerId);
+ if (info != null) {
+ container = info.container;
+ }
+ return container;
+ }
+
+ /**
+ * Get the reserved container in the node.
+ * @return Reserved container in the node.
+ */
+ public synchronized RMContainer getReservedContainer() {
+ return reservedContainer;
+ }
+
+ /**
+ * Set the reserved container in the node.
+ * @param reservedContainer Reserved container in the node.
+ */
+ public synchronized void
+ setReservedContainer(RMContainer reservedContainer) {
+ this.reservedContainer = reservedContainer;
+ }
+
+ /**
+ * Recover a container.
+ * @param rmContainer Container to recover.
+ */
+ public synchronized void recoverContainer(RMContainer rmContainer) {
+ if (rmContainer.getState().equals(RMContainerState.COMPLETED)) {
+ return;
+ }
+ allocateContainer(rmContainer, true);
+ }
+
+ /**
+ * Get the labels for the node.
+ * @return Set of labels for the node.
+ */
+ public Set<String> getLabels() {
+ return labels;
+ }
+
+ /**
+ * Update the labels for the node.
+ * @param labels Set of labels for the node.
+ */
+ public void updateLabels(Set<String> labels) {
+ this.labels = labels;
+ }
+
+ /**
+ * Get the partition the node belongs to. If the node labels of this node
+ * are empty or null, it belongs to the NO_LABEL partition. Since we only
+ * support one partition per node (YARN-2694), the first label is its
+ * partition.
+ * @return Partition for the node.
+ */
+ public String getPartition() {
+ if (this.labels == null || this.labels.isEmpty()) {
+ return RMNodeLabelsManager.NO_LABEL;
+ } else {
+ return this.labels.iterator().next();
+ }
+ }
+
+ /**
+ * Set the resource utilization of the containers in the node.
+ * @param containersUtilization Resource utilization of the containers.
+ */
+ public void setAggregatedContainersUtilization(
+ ResourceUtilization containersUtilization) {
+ this.containersUtilization = containersUtilization;
+ }
+
+ /**
+ * Get the resource utilization of the containers in the node.
+ * @return Resource utilization of the containers.
+ */
+ public ResourceUtilization getAggregatedContainersUtilization() {
+ return this.containersUtilization;
+ }
+
+ /**
+ * Set the resource utilization of the node. This includes the containers.
+ * @param nodeUtilization Resource utilization of the node.
+ */
+ public void setNodeUtilization(ResourceUtilization nodeUtilization) {
+ this.nodeUtilization = nodeUtilization;
+ }
+
+ /**
+ * Get the resource utilization of the node.
+ * @return Resource utilization of the node.
+ */
+ public ResourceUtilization getNodeUtilization() {
+ return this.nodeUtilization;
+ }
+
+ public long getLastHeartbeatMonotonicTime() {
+ return lastHeartbeatMonotonicTime;
+ }
+
+ /**
+ * This will be called for each node heartbeat.
+ */
+ public void notifyNodeUpdate() {
+ this.lastHeartbeatMonotonicTime = Time.monotonicNow();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof SchedulerNode)) {
+ return false;
+ }
+
+ SchedulerNode that = (SchedulerNode) o;
+
+ return getNodeID().equals(that.getNodeID());
+ }
+
+ @Override
+ public int hashCode() {
+ return getNodeID().hashCode();
+ }
+
+ public Set<NodeAttribute> getNodeAttributes() {
+ return nodeAttributes;
+ }
+
+ public void updateNodeAttributes(Set<NodeAttribute> attributes) {
+ this.nodeAttributes = attributes;
+ }
+
+ private static class ContainerInfo {
+ private final RMContainer container;
+ private boolean launchedOnNode;
+
+ public ContainerInfo(RMContainer container, boolean launchedOnNode) {
+ this.container = container;
+ this.launchedOnNode = launchedOnNode;
+ }
+ }
}
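Aside from re-indentation, the SchedulerNode hunk above is behavior-preserving, but the CompareToBuilder in getContainersToKill() is easy to misread: two of its three sort keys are deliberately reversed. Below is a minimal standalone sketch of the same kill ordering, with hypothetical ExecType and ContainerStub types standing in for ExecutionType and RMContainer (Java 16+; everything here is illustrative, not part of the patch):

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

// Hypothetical stand-ins for ExecutionType and RMContainer, just to
// demonstrate the ordering used by getContainersToKill().
enum ExecType { GUARANTEED, OPPORTUNISTIC }

record ContainerStub(boolean am, ExecType execType, long creationTime) {}

public class KillOrderDemo {
  public static void main(String[] args) {
    List<ContainerStub> containers = new ArrayList<>(List.of(
        new ContainerStub(true, ExecType.GUARANTEED, 100),    // AM: killed last
        new ContainerStub(false, ExecType.GUARANTEED, 200),
        new ContainerStub(false, ExecType.OPPORTUNISTIC, 50)  // killed first
    ));
    // Same three-level ordering as the CompareToBuilder above: non-AMs before
    // AMs, OPPORTUNISTIC before GUARANTEED (reversed enum order), and newer
    // containers before older ones (reversed creation times).
    containers.sort(Comparator
        .comparing(ContainerStub::am)
        .thenComparing(ContainerStub::execType, Comparator.reverseOrder())
        .thenComparing(ContainerStub::creationTime, Comparator.reverseOrder()));
    containers.forEach(System.out::println);
  }
}
```

Sorting the three stubs prints the OPPORTUNISTIC container first and the AM last, matching the Javadoc's contract of killing OPPORTUNISTIC containers first and AMs last.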
diff --git a/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
index 4419903402303a8bc814fedb055e219de4188f8c..98d3b4b8974edf0c2ef0169fbdc64296f7254505 100644
--- a/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
+++ b/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,33 +18,17 @@
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.EnumSet;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Random;
-import java.util.Set;
-import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.atomic.AtomicBoolean;
-
+import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.slf4j.Marker;
-import org.slf4j.MarkerFactory;
import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
+import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.SettableFuture;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
@@ -78,7 +62,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.placement.PlacementRule;
import org.apache.hadoop.yarn.server.resourcemanager.placement.UserGroupMappingPlacementRule;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData;
-
import org.apache.hadoop.yarn.server.resourcemanager.reservation.ReservationConstants;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
@@ -106,7 +89,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication;
-
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerDynamicEditException;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
@@ -143,9 +125,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeLabelsU
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeResourceUpdateSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
-
-import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event
- .QueueManagementChangeEvent;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.QueueManagementChangeEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ReleaseContainerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
@@ -161,10 +141,42 @@ import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.apache.hadoop.yarn.util.resource.Resources;
+import org.apache.http.HttpEntity;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.util.EntityUtils;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.slf4j.Marker;
+import org.slf4j.MarkerFactory;
-import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
-import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
-import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.SettableFuture;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Date;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Random;
+import java.util.Set;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.stream.Collectors;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.QUEUE_MAPPING;
@@ -172,3178 +184,3415 @@ import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.C
@Evolving
@SuppressWarnings("unchecked")
public class CapacityScheduler extends
- AbstractYarnScheduler<FiCaSchedulerApp, FiCaSchedulerNode> implements
- PreemptableResourceScheduler, CapacitySchedulerContext, Configurable,
- ResourceAllocationCommitter, MutableConfScheduler {
+ AbstractYarnScheduler<FiCaSchedulerApp, FiCaSchedulerNode> implements
+ PreemptableResourceScheduler, CapacitySchedulerContext, Configurable,
+ ResourceAllocationCommitter, MutableConfScheduler {
+
+ private static final Marker FATAL =
+ MarkerFactory.getMarker("FATAL");
+ private static Logger LOG =
+ LoggerFactory.getLogger(CapacityScheduler.class);
+
+ private CapacitySchedulerQueueManager queueManager;
+
+ private WorkflowPriorityMappingsManager workflowPriorityMappingsMgr;
+
+ // timeout to join when we stop this service
+ protected final long THREAD_JOIN_TIMEOUT_MS = 1000;
+
+ private PreemptionManager preemptionManager = new PreemptionManager();
+
+ private volatile boolean isLazyPreemptionEnabled = false;
+
+ private int offswitchPerHeartbeatLimit;
+
+ private boolean assignMultipleEnabled;
+
+ private int maxAssignPerHeartbeat;
+
+ private CSConfigurationProvider csConfProvider;
+
+ @Override
+ public void setConf(Configuration conf) {
+ yarnConf = conf;
+ }
+
+ private void validateConf(Configuration conf) {
+ // validate scheduler memory allocation setting
+ CapacitySchedulerConfigValidator.validateMemoryAllocation(conf);
+ // validate scheduler vcores allocation setting
+ CapacitySchedulerConfigValidator.validateVCores(conf);
+ }
+
+ @Override
+ public Configuration getConf() {
+ return yarnConf;
+ }
+
+ private CapacitySchedulerConfiguration conf;
+ private Configuration yarnConf;
+
+ private ResourceCalculator calculator;
+ private boolean usePortForNodeName;
+
+ private boolean scheduleAsynchronously;
+ @VisibleForTesting
+ protected List<AsyncScheduleThread> asyncSchedulerThreads;
+ private ResourceCommitterService resourceCommitterService;
+
+ protected LoadsMetricServerRequestThread loadsMetricServerRequestThread;
+ private static final String LOADS_METRIC_SERVER_REQUEST_INTERVAL_MS =
+ CapacitySchedulerConfiguration.PREFIX + "loads-metric-server.request-interval-ms";
+ private static final long DEFAULT_LOADS_METRIC_SERVER_REQUEST_INTERVAL_MS = 1000L;
+ private long loadsMetricServerRequestInterval;
+ private static final String LOADS_METRIC_SERVER_ADDRESS =
+ CapacitySchedulerConfiguration.PREFIX + "loads-metric-server.address";
+ private String loadsMetricServerAddress;
+ private String loadsMetricServerSortUrl;
+ private static final String LOADS_METRIC_SERVER_SORT_PATH = "/scheduler/sortWithLogical";
+ private String loadsMetricServerUploadNodeResourceUrl;
+ private static final String LOADS_METRIC_SERVER_UPLOAD_NODE_RESOURCE_PATH = "/loadStatus/addNodeLogicalResource";
+
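+ // Percent knob for async global scheduling (config key and 100.0 default
+ // follow); the code that consumes it lies outside this excerpt.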
+ private double asyncScheduleGlobalAllocatePercent;
+ private static final String ASYNC_SCHEDULE_GLOBAL_ALLOCATE_PERCENT =
+ CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_PREFIX
+ + ".global.allocate-percent";
+ private static final double DEFAULT_ASYNC_SCHEDULE_GLOBAL_ALLOCATE_PERCENT = 100.0d;
+ private RMNodeLabelsManager labelManager;
+ private AppPriorityACLsManager appPriorityACLManager;
+ private boolean multiNodePlacementEnabled;
+
+ private static boolean printedVerboseLoggingForAsyncScheduling = false;
- private static final Marker FATAL =
- MarkerFactory.getMarker("FATAL");
- private static final Logger LOG =
- LoggerFactory.getLogger(CapacityScheduler.class);
+ /**
+ * EXPERT
+ */
+ private long asyncScheduleInterval;
+ private static final String ASYNC_SCHEDULER_INTERVAL =
+ CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_PREFIX
+ + ".scheduling-interval-ms";
+ private static final long DEFAULT_ASYNC_SCHEDULER_INTERVAL = 5;
+ private long asyncMaxPendingBacklogs;
- private CapacitySchedulerQueueManager queueManager;
+ private CSMaxRunningAppsEnforcer maxRunningEnforcer;
- private WorkflowPriorityMappingsManager workflowPriorityMappingsMgr;
+ public CapacityScheduler() {
+ super(CapacityScheduler.class.getName());
+ this.maxRunningEnforcer = new CSMaxRunningAppsEnforcer(this);
+ }
+
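+ // Named variant: re-points the non-final static LOG at a logger for the
+ // given name.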
+ public CapacityScheduler(String name) {
+ super(name);
+ LOG = LoggerFactory.getLogger(name);
+ this.maxRunningEnforcer = new CSMaxRunningAppsEnforcer(this);
+ }
+
+ public LoadsMetricServerRequestThread getLoadsMetricServerRequestThread() {
+ return loadsMetricServerRequestThread;
+ }
- // timeout to join when we stop this service
- protected final long THREAD_JOIN_TIMEOUT_MS = 1000;
-
- private PreemptionManager preemptionManager = new PreemptionManager();
+ @Override
+ public QueueMetrics getRootQueueMetrics() {
+ return getRootQueue().getMetrics();
+ }
- private volatile boolean isLazyPreemptionEnabled = false;
-
- private int offswitchPerHeartbeatLimit;
+ public CSQueue getRootQueue() {
+ return queueManager.getRootQueue();
+ }
- private boolean assignMultipleEnabled;
-
- private int maxAssignPerHeartbeat;
-
- private CSConfigurationProvider csConfProvider;
-
- @Override
- public void setConf(Configuration conf) {
- yarnConf = conf;
- }
-
- private void validateConf(Configuration conf) {
- // validate scheduler memory allocation setting
- CapacitySchedulerConfigValidator.validateMemoryAllocation(conf);
- // validate scheduler vcores allocation setting
- CapacitySchedulerConfigValidator.validateVCores(conf);
- }
-
- @Override
- public Configuration getConf() {
- return yarnConf;
- }
-
- private CapacitySchedulerConfiguration conf;
- private Configuration yarnConf;
-
- private ResourceCalculator calculator;
- private boolean usePortForNodeName;
-
- private boolean scheduleAsynchronously;
- @VisibleForTesting
- protected List<AsyncScheduleThread> asyncSchedulerThreads;
- private ResourceCommitterService resourceCommitterService;
- private RMNodeLabelsManager labelManager;
- private AppPriorityACLsManager appPriorityACLManager;
- private boolean multiNodePlacementEnabled;
-
- private static boolean printedVerboseLoggingForAsyncScheduling = false;
-
- /**
- * EXPERT
- */
- private long asyncScheduleInterval;
- private static final String ASYNC_SCHEDULER_INTERVAL =
- CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_PREFIX
- + ".scheduling-interval-ms";
- private static final long DEFAULT_ASYNC_SCHEDULER_INTERVAL = 5;
- private long asyncMaxPendingBacklogs;
-
- private CSMaxRunningAppsEnforcer maxRunningEnforcer;
-
- public CapacityScheduler() {
- super(CapacityScheduler.class.getName());
- this.maxRunningEnforcer = new CSMaxRunningAppsEnforcer(this);
- }
-
- @Override
- public QueueMetrics getRootQueueMetrics() {
- return getRootQueue().getMetrics();
- }
-
- public CSQueue getRootQueue() {
- return queueManager.getRootQueue();
- }
-
- @Override
- public CapacitySchedulerConfiguration getConfiguration() {
- return conf;
- }
-
- @Override
- public RMContainerTokenSecretManager getContainerTokenSecretManager() {
- return this.rmContext.getContainerTokenSecretManager();
- }
-
- @Override
- public ResourceCalculator getResourceCalculator() {
- return calculator;
- }
-
- @VisibleForTesting
- public void setResourceCalculator(ResourceCalculator rc) {
- this.calculator = rc;
- }
-
- @Override
- public int getNumClusterNodes() {
- return nodeTracker.nodeCount();
- }
-
- @Override
- public RMContext getRMContext() {
- return this.rmContext;
- }
-
- @Override
- public void setRMContext(RMContext rmContext) {
- this.rmContext = rmContext;
- }
-
- @VisibleForTesting
- void initScheduler(Configuration configuration) throws
- IOException, YarnException {
- writeLock.lock();
- try {
- String confProviderStr = configuration.get(
- YarnConfiguration.SCHEDULER_CONFIGURATION_STORE_CLASS,
- YarnConfiguration.DEFAULT_CONFIGURATION_STORE);
- switch (confProviderStr) {
- case YarnConfiguration.FILE_CONFIGURATION_STORE:
- this.csConfProvider =
- new FileBasedCSConfigurationProvider(rmContext);
- break;
- case YarnConfiguration.MEMORY_CONFIGURATION_STORE:
- case YarnConfiguration.LEVELDB_CONFIGURATION_STORE:
- case YarnConfiguration.ZK_CONFIGURATION_STORE:
- case YarnConfiguration.FS_CONFIGURATION_STORE:
- this.csConfProvider = new MutableCSConfigurationProvider(rmContext);
- break;
- default:
- throw new IOException("Invalid configuration store class: " +
- confProviderStr);
- }
- this.csConfProvider.init(configuration);
- this.conf = this.csConfProvider.loadConfiguration(configuration);
- validateConf(this.conf);
- this.minimumAllocation = super.getMinimumAllocation();
- initMaximumResourceCapability(super.getMaximumAllocation());
- this.calculator = this.conf.getResourceCalculator();
- if (this.calculator instanceof DefaultResourceCalculator
- && ResourceUtils.getNumberOfKnownResourceTypes() > 2) {
- throw new YarnRuntimeException("RM uses DefaultResourceCalculator which"
- + " used only memory as resource-type but invalid resource-types"
- + " specified " + ResourceUtils.getResourceTypes() + ". Use"
- + " DominantResourceCalculator instead to make effective use of"
- + " these resource-types");
- }
- this.usePortForNodeName = this.conf.getUsePortForNodeName();
- this.applications = new ConcurrentHashMap<>();
- this.labelManager = rmContext.getNodeLabelManager();
- this.appPriorityACLManager = new AppPriorityACLsManager(conf);
- this.queueManager = new CapacitySchedulerQueueManager(yarnConf,
- this.labelManager, this.appPriorityACLManager);
- this.queueManager.setCapacitySchedulerContext(this);
-
- this.workflowPriorityMappingsMgr = new WorkflowPriorityMappingsManager();
-
- this.activitiesManager = new ActivitiesManager(rmContext);
- activitiesManager.init(conf);
- initializeQueues(this.conf);
- this.isLazyPreemptionEnabled = conf.getLazyPreemptionEnabled();
-
- scheduleAsynchronously = this.conf.getScheduleAynschronously();
- asyncScheduleInterval = this.conf.getLong(ASYNC_SCHEDULER_INTERVAL,
- DEFAULT_ASYNC_SCHEDULER_INTERVAL);
-
- this.assignMultipleEnabled = this.conf.getAssignMultipleEnabled();
- this.maxAssignPerHeartbeat = this.conf.getMaxAssignPerHeartbeat();
-
- // number of threads for async scheduling
- int maxAsyncSchedulingThreads = this.conf.getInt(
- CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_THREAD,
- 1);
- maxAsyncSchedulingThreads = Math.max(maxAsyncSchedulingThreads, 1);
-
- if (scheduleAsynchronously) {
- asyncSchedulerThreads = new ArrayList<>();
- for (int i = 0; i < maxAsyncSchedulingThreads; i++) {
- asyncSchedulerThreads.add(new AsyncScheduleThread(this));
- }
- resourceCommitterService = new ResourceCommitterService(this);
- asyncMaxPendingBacklogs = this.conf.getInt(
- CapacitySchedulerConfiguration.
- SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_PENDING_BACKLOGS,
- CapacitySchedulerConfiguration.
- DEFAULT_SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_PENDING_BACKLOGS);
- }
-
- // Setup how many containers we can allocate for each round
- offswitchPerHeartbeatLimit = this.conf.getOffSwitchPerHeartbeatLimit();
-
- // Register CS specific multi-node policies to common MultiNodeManager
- // which will add to a MultiNodeSorter which gives a pre-sorted list of
- // nodes to scheduler's allocation.
- multiNodePlacementEnabled = this.conf.getMultiNodePlacementEnabled();
- if(rmContext.getMultiNodeSortingManager() != null) {
- rmContext.getMultiNodeSortingManager().registerMultiNodePolicyNames(
- multiNodePlacementEnabled,
- this.conf.getMultiNodePlacementPolicies());
- }
-
- LOG.info("Initialized CapacityScheduler with " + "calculator="
- + getResourceCalculator().getClass() + ", " + "minimumAllocation=<"
- + getMinimumResourceCapability() + ">, " + "maximumAllocation=<"
- + getMaximumResourceCapability() + ">, " + "asynchronousScheduling="
- + scheduleAsynchronously + ", " + "asyncScheduleInterval="
- + asyncScheduleInterval + "ms" + ",multiNodePlacementEnabled="
- + multiNodePlacementEnabled + ", " + "assignMultipleEnabled="
- + assignMultipleEnabled + ", " + "maxAssignPerHeartbeat="
- + maxAssignPerHeartbeat + ", " + "offswitchPerHeartbeatLimit="
- + offswitchPerHeartbeatLimit);
- } finally {
- writeLock.unlock();
- }
- }
-
- private void startSchedulerThreads() {
- writeLock.lock();
- try {
- activitiesManager.start();
- if (scheduleAsynchronously) {
- Preconditions.checkNotNull(asyncSchedulerThreads,
- "asyncSchedulerThreads is null");
- for (Thread t : asyncSchedulerThreads) {
- t.start();
- }
-
- resourceCommitterService.start();
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- public void serviceInit(Configuration conf) throws Exception {
- Configuration configuration = new Configuration(conf);
- super.serviceInit(conf);
- initScheduler(configuration);
- // Initialize SchedulingMonitorManager
- schedulingMonitorManager.initialize(rmContext, conf);
- }
-
- @Override
- public void serviceStart() throws Exception {
- startSchedulerThreads();
- super.serviceStart();
- }
-
- @Override
- public void serviceStop() throws Exception {
- writeLock.lock();
- try {
- this.activitiesManager.stop();
- if (scheduleAsynchronously && asyncSchedulerThreads != null) {
- for (Thread t : asyncSchedulerThreads) {
- t.interrupt();
- t.join(THREAD_JOIN_TIMEOUT_MS);
- }
- resourceCommitterService.interrupt();
- resourceCommitterService.join(THREAD_JOIN_TIMEOUT_MS);
- }
- } finally {
- writeLock.unlock();
- }
-
- if (isConfigurationMutable()) {
- ((MutableConfigurationProvider) csConfProvider).close();
- }
- super.serviceStop();
- }
-
- public void reinitialize(Configuration newConf, RMContext rmContext,
- boolean validation) throws IOException {
- writeLock.lock();
- try {
- Configuration configuration = new Configuration(newConf);
- CapacitySchedulerConfiguration oldConf = this.conf;
- if (validation) {
- this.conf = new CapacitySchedulerConfiguration(newConf, false);
- } else {
- this.conf = csConfProvider.loadConfiguration(configuration);
- }
- validateConf(this.conf);
- try {
- LOG.info("Re-initializing queues...");
- refreshMaximumAllocation(
- ResourceUtils.fetchMaximumAllocationFromConfig(this.conf));
- reinitializeQueues(this.conf);
- } catch (Throwable t) {
- this.conf = oldConf;
- refreshMaximumAllocation(
- ResourceUtils.fetchMaximumAllocationFromConfig(this.conf));
- throw new IOException("Failed to re-init queues : " + t.getMessage(),
- t);
- }
- if (!validation) {
-
- // update lazy preemption
- this.isLazyPreemptionEnabled = this.conf.getLazyPreemptionEnabled();
-
- // Setup how many containers we can allocate for each round
- assignMultipleEnabled = this.conf.getAssignMultipleEnabled();
- maxAssignPerHeartbeat = this.conf.getMaxAssignPerHeartbeat();
- offswitchPerHeartbeatLimit = this.conf.getOffSwitchPerHeartbeatLimit();
-
- LOG.info("assignMultipleEnabled = " + assignMultipleEnabled + "\n" +
- "maxAssignPerHeartbeat = " + maxAssignPerHeartbeat + "\n" +
- "offswitchPerHeartbeatLimit = " + offswitchPerHeartbeatLimit);
-
- super.reinitialize(newConf, rmContext);
- }
- maxRunningEnforcer.updateRunnabilityOnReload();
- } finally {
- writeLock.unlock();
- }
-
- }
-
- @Override
- public void reinitialize(Configuration newConf, RMContext rmContext)
- throws IOException {
- reinitialize(newConf, rmContext, false);
- }
-
- long getAsyncScheduleInterval() {
- return asyncScheduleInterval;
- }
-
- private final static Random random = new Random(System.currentTimeMillis());
-
- private static boolean shouldSkipNodeSchedule(FiCaSchedulerNode node,
- CapacityScheduler cs, boolean printVerboseLog) {
- // Skip node which missed 2 heartbeats since the node might be dead and
- // we should not continue allocate containers on that.
- long timeElapsedFromLastHeartbeat =
- Time.monotonicNow() - node.getLastHeartbeatMonotonicTime();
- if (timeElapsedFromLastHeartbeat > cs.nmHeartbeatInterval * 2) {
- if (printVerboseLog && LOG.isDebugEnabled()) {
- LOG.debug("Skip scheduling on node because it haven't heartbeated for "
- + timeElapsedFromLastHeartbeat / 1000.0f + " secs");
- }
- return true;
- }
- return false;
- }
-
- /**
- * Schedule on all nodes by starting at a random point.
- * @param cs
- */
- static void schedule(CapacityScheduler cs) throws InterruptedException{
- // First randomize the start point
- int current = 0;
- Collection<FiCaSchedulerNode> nodes = cs.nodeTracker.getAllNodes();
-
- // If nodes size is 0 (when there are no node managers registered,
- // we can return from here itself.
- int nodeSize = nodes.size();
- if(nodeSize == 0) {
- return;
- }
- int start = random.nextInt(nodeSize);
-
- // To avoid too verbose DEBUG logging, only print debug log once for
- // every 10 secs.
- boolean printSkipedNodeLogging = false;
- if (Time.monotonicNow() / 1000 % 10 == 0) {
- printSkipedNodeLogging = (!printedVerboseLoggingForAsyncScheduling);
- } else {
- printedVerboseLoggingForAsyncScheduling = false;
- }
-
- // Allocate containers of node [start, end)
- for (FiCaSchedulerNode node : nodes) {
- if (current++ >= start) {
- if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) {
- continue;
- }
- cs.allocateContainersToNode(node.getNodeID(), false);
- }
- }
-
- current = 0;
-
- // Allocate containers of node [0, start)
- for (FiCaSchedulerNode node : nodes) {
- if (current++ > start) {
- break;
- }
- if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) {
- continue;
- }
- cs.allocateContainersToNode(node.getNodeID(), false);
+ @Override
+ public CapacitySchedulerConfiguration getConfiguration() {
+ return conf;
}
- if (printSkipedNodeLogging) {
- printedVerboseLoggingForAsyncScheduling = true;
+ @Override
+ public RMContainerTokenSecretManager getContainerTokenSecretManager() {
+ return this.rmContext.getContainerTokenSecretManager();
}
- Thread.sleep(cs.getAsyncScheduleInterval());
- }
+ @Override
+ public ResourceCalculator getResourceCalculator() {
+ return calculator;
+ }
- static class AsyncScheduleThread extends Thread {
+ @VisibleForTesting
+ public void setResourceCalculator(ResourceCalculator rc) {
+ this.calculator = rc;
+ }
- private final CapacityScheduler cs;
- private AtomicBoolean runSchedules = new AtomicBoolean(false);
+ @Override
+ public int getNumClusterNodes() {
+ return nodeTracker.nodeCount();
+ }
- public AsyncScheduleThread(CapacityScheduler cs) {
- this.cs = cs;
- setDaemon(true);
+ @Override
+ public RMContext getRMContext() {
+ return this.rmContext;
}
@Override
- public void run() {
- int debuggingLogCounter = 0;
- while (!Thread.currentThread().isInterrupted()) {
+ public void setRMContext(RMContext rmContext) {
+ this.rmContext = rmContext;
+ }
+
+ @VisibleForTesting
+ void initScheduler(Configuration configuration) throws
+ IOException, YarnException {
+ writeLock.lock();
try {
- if (!runSchedules.get()) {
- Thread.sleep(100);
- } else {
- // Don't run schedule if we have some pending backlogs already
- if (cs.getAsyncSchedulingPendingBacklogs()
- > cs.asyncMaxPendingBacklogs) {
- Thread.sleep(1);
- } else{
- schedule(cs);
- if(LOG.isDebugEnabled()) {
- // Adding a debug log here to ensure that the thread is alive
- // and running fine.
- if (debuggingLogCounter++ > 10000) {
- debuggingLogCounter = 0;
- LOG.debug("AsyncScheduleThread[" + getName() + "] is running!");
+ String confProviderStr = configuration.get(
+ YarnConfiguration.SCHEDULER_CONFIGURATION_STORE_CLASS,
+ YarnConfiguration.DEFAULT_CONFIGURATION_STORE);
+ switch (confProviderStr) {
+ case YarnConfiguration.FILE_CONFIGURATION_STORE:
+ this.csConfProvider =
+ new FileBasedCSConfigurationProvider(rmContext);
+ break;
+ case YarnConfiguration.MEMORY_CONFIGURATION_STORE:
+ case YarnConfiguration.LEVELDB_CONFIGURATION_STORE:
+ case YarnConfiguration.ZK_CONFIGURATION_STORE:
+ case YarnConfiguration.FS_CONFIGURATION_STORE:
+ this.csConfProvider = new MutableCSConfigurationProvider(rmContext);
+ break;
+ default:
+ throw new IOException("Invalid configuration store class: " +
+ confProviderStr);
+ }
+ this.csConfProvider.init(configuration);
+ this.conf = this.csConfProvider.loadConfiguration(configuration);
+ validateConf(this.conf);
+ this.minimumAllocation = super.getMinimumAllocation();
+ initMaximumResourceCapability(super.getMaximumAllocation());
+ this.calculator = this.conf.getResourceCalculator();
+ if (this.calculator instanceof DefaultResourceCalculator
+ && ResourceUtils.getNumberOfKnownResourceTypes() > 2) {
+ throw new YarnRuntimeException("RM uses DefaultResourceCalculator which"
+ + " used only memory as resource-type but invalid resource-types"
+ + " specified " + ResourceUtils.getResourceTypes() + ". Use"
+ + " DominantResourceCalculator instead to make effective use of"
+ + " these resource-types");
+ }
+ this.usePortForNodeName = this.conf.getUsePortForNodeName();
+ this.applications = new ConcurrentHashMap<>();
+ this.labelManager = rmContext.getNodeLabelManager();
+ this.appPriorityACLManager = new AppPriorityACLsManager(conf);
+ this.queueManager = new CapacitySchedulerQueueManager(yarnConf,
+ this.labelManager, this.appPriorityACLManager);
+ this.queueManager.setCapacitySchedulerContext(this);
+
+ this.workflowPriorityMappingsMgr = new WorkflowPriorityMappingsManager();
+
+ this.activitiesManager = new ActivitiesManager(rmContext);
+ activitiesManager.init(conf);
+ initializeQueues(this.conf);
+ this.isLazyPreemptionEnabled = conf.getLazyPreemptionEnabled();
+
+ scheduleAsynchronously = this.conf.getScheduleAynschronously();
+ asyncScheduleInterval = this.conf.getLong(ASYNC_SCHEDULER_INTERVAL,
+ DEFAULT_ASYNC_SCHEDULER_INTERVAL);
+
+ this.assignMultipleEnabled = this.conf.getAssignMultipleEnabled();
+ this.maxAssignPerHeartbeat = this.conf.getMaxAssignPerHeartbeat();
+
+ // number of threads for async scheduling
+ int maxAsyncSchedulingThreads = this.conf.getInt(
+ CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_THREAD,
+ 1);
+ maxAsyncSchedulingThreads = Math.max(maxAsyncSchedulingThreads, 1);
+
+ if (scheduleAsynchronously) {
+ asyncSchedulerThreads = new ArrayList<>();
+ for (int i = 0; i < maxAsyncSchedulingThreads; i++) {
+ asyncSchedulerThreads.add(new AsyncScheduleThread(this));
}
- }
+ resourceCommitterService = new ResourceCommitterService(this);
+ asyncMaxPendingBacklogs = this.conf.getInt(
+ CapacitySchedulerConfiguration.
+ SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_PENDING_BACKLOGS,
+ CapacitySchedulerConfiguration.
+ DEFAULT_SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_PENDING_BACKLOGS);
+ asyncScheduleGlobalAllocatePercent = this.conf.getDouble(ASYNC_SCHEDULE_GLOBAL_ALLOCATE_PERCENT,
+ DEFAULT_ASYNC_SCHEDULE_GLOBAL_ALLOCATE_PERCENT);
+ }
+
+ loadsMetricServerRequestThread = new LoadsMetricServerRequestThread(this);
+
+ loadsMetricServerAddress = this.conf.get(LOADS_METRIC_SERVER_ADDRESS);
+ if (StringUtils.isEmpty(loadsMetricServerAddress) || !loadsMetricServerAddress.matches("[\\w.]+:\\d+")) {
+ throw new YarnRuntimeException(LOADS_METRIC_SERVER_ADDRESS + " is invalid, expected host:port, e.g. server1:9090");
+ }
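+ // Illustrative: "server1:9090" passes the host:port check above, while
+ // values with a scheme (e.g. "http://server1:9090") or without a port
+ // fail fast here at scheduler init.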
+ loadsMetricServerSortUrl = String
+ .format("http://%s/%s", loadsMetricServerAddress, LOADS_METRIC_SERVER_SORT_PATH);
+ loadsMetricServerUploadNodeResourceUrl = String
+ .format("http://%s/%s", loadsMetricServerAddress, LOADS_METRIC_SERVER_UPLOAD_NODE_RESOURCE_PATH);
+
+ loadsMetricServerRequestInterval = this.conf.getLong(LOADS_METRIC_SERVER_REQUEST_INTERVAL_MS,
+ DEFAULT_LOADS_METRIC_SERVER_REQUEST_INTERVAL_MS);
+
+ // Setup how many containers we can allocate for each round
+ offswitchPerHeartbeatLimit = this.conf.getOffSwitchPerHeartbeatLimit();
+
+ // Register CS specific multi-node policies to common MultiNodeManager
+ // which will add to a MultiNodeSorter which gives a pre-sorted list of
+ // nodes to scheduler's allocation.
+ multiNodePlacementEnabled = this.conf.getMultiNodePlacementEnabled();
+ if (rmContext.getMultiNodeSortingManager() != null) {
+ rmContext.getMultiNodeSortingManager().registerMultiNodePolicyNames(
+ multiNodePlacementEnabled,
+ this.conf.getMultiNodePlacementPolicies());
}
- }
- } catch (InterruptedException ie) {
- // keep interrupt signal
- Thread.currentThread().interrupt();
+
+ LOG.info("Initialized CapacityScheduler with " + "calculator="
+ + getResourceCalculator().getClass() + ", " + "minimumAllocation=<"
+ + getMinimumResourceCapability() + ">, " + "maximumAllocation=<"
+ + getMaximumResourceCapability() + ">, " + "asynchronousScheduling="
+ + scheduleAsynchronously + ", " + "asyncScheduleInterval="
+ + asyncScheduleInterval + "ms" + ",multiNodePlacementEnabled="
+ + multiNodePlacementEnabled + ", " + "assignMultipleEnabled="
+ + assignMultipleEnabled + ", " + "maxAssignPerHeartbeat="
+ + maxAssignPerHeartbeat + ", " + "offswitchPerHeartbeatLimit="
+ + offswitchPerHeartbeatLimit);
+ } finally {
+ writeLock.unlock();
}
- }
- LOG.info("AsyncScheduleThread[" + getName() + "] exited!");
}
- public void beginSchedule() {
- runSchedules.set(true);
- }
+ private void startSchedulerThreads() {
+ writeLock.lock();
+ try {
+ activitiesManager.start();
+ if (scheduleAsynchronously) {
+ Preconditions.checkNotNull(asyncSchedulerThreads,
+ "asyncSchedulerThreads is null");
+ for (Thread t : asyncSchedulerThreads) {
+ t.start();
+ }
- public void suspendSchedule() {
- runSchedules.set(false);
+ resourceCommitterService.start();
+ }
+ loadsMetricServerRequestThread.start();
+ } finally {
+ writeLock.unlock();
+ }
}
- }
-
- static class ResourceCommitterService extends Thread {
- private final CapacityScheduler cs;
- private BlockingQueue>
- backlogs = new LinkedBlockingQueue<>();
+ @Override
+ public void serviceInit(Configuration conf) throws Exception {
+ Configuration configuration = new Configuration(conf);
+ super.serviceInit(conf);
+ initScheduler(configuration);
+ // Initialize SchedulingMonitorManager
+ schedulingMonitorManager.initialize(rmContext, conf);
+ }
- public ResourceCommitterService(CapacityScheduler cs) {
- this.cs = cs;
- setDaemon(true);
+ @Override
+ public void serviceStart() throws Exception {
+ startSchedulerThreads();
+ super.serviceStart();
}
@Override
- public void run() {
- while (!Thread.currentThread().isInterrupted()) {
+ public void serviceStop() throws Exception {
+ writeLock.lock();
try {
- ResourceCommitRequest request =
- backlogs.take();
- cs.writeLock.lock();
- try {
- cs.tryCommit(cs.getClusterResource(), request, true);
- } finally {
- cs.writeLock.unlock();
- }
-
- } catch (InterruptedException e) {
- LOG.error(e.toString());
- Thread.currentThread().interrupt();
- }
- }
- LOG.info("ResourceCommitterService exited!");
- }
-
- public void addNewCommitRequest(
- ResourceCommitRequest proposal) {
- backlogs.add(proposal);
- }
-
- public int getPendingBacklogs() {
- return backlogs.size();
- }
- }
-
- @VisibleForTesting
- public PlacementRule getUserGroupMappingPlacementRule() throws IOException {
- readLock.lock();
- try {
- UserGroupMappingPlacementRule ugRule = new UserGroupMappingPlacementRule();
- ugRule.initialize(this);
- return ugRule;
- } finally {
- readLock.unlock();
- }
- }
-
- public PlacementRule getAppNameMappingPlacementRule() throws IOException {
- readLock.lock();
- try {
- AppNameMappingPlacementRule anRule = new AppNameMappingPlacementRule();
- anRule.initialize(this);
- return anRule;
- } finally {
- readLock.unlock();
- }
- }
-
- @VisibleForTesting
- public void updatePlacementRules() throws IOException {
- // Initialize placement rules
- Collection placementRuleStrs = conf.getStringCollection(
- YarnConfiguration.QUEUE_PLACEMENT_RULES);
- List placementRules = new ArrayList<>();
- Set distinguishRuleSet = CapacitySchedulerConfigValidator
- .validatePlacementRules(placementRuleStrs);
-
- // Add UserGroupMappingPlacementRule if empty; the default value of
- // yarn.scheduler.queue-placement-rules is user-group
- if (distinguishRuleSet.isEmpty()) {
- distinguishRuleSet.add(YarnConfiguration.USER_GROUP_PLACEMENT_RULE);
- }
-
- placementRuleStrs = new ArrayList<>(distinguishRuleSet);
-
- for (String placementRuleStr : placementRuleStrs) {
- switch (placementRuleStr) {
- case YarnConfiguration.USER_GROUP_PLACEMENT_RULE:
- PlacementRule ugRule = getUserGroupMappingPlacementRule();
- if (null != ugRule) {
- placementRules.add(ugRule);
- }
- break;
- case YarnConfiguration.APP_NAME_PLACEMENT_RULE:
- PlacementRule anRule = getAppNameMappingPlacementRule();
- if (null != anRule) {
- placementRules.add(anRule);
- }
- break;
- default:
- boolean isMappingNotEmpty;
+ this.activitiesManager.stop();
+ if (scheduleAsynchronously && asyncSchedulerThreads != null) {
+ for (Thread t : asyncSchedulerThreads) {
+ t.interrupt();
+ t.join(THREAD_JOIN_TIMEOUT_MS);
+ }
+ resourceCommitterService.interrupt();
+ resourceCommitterService.join(THREAD_JOIN_TIMEOUT_MS);
+ }
+ } finally {
+ writeLock.unlock();
+ }
+
+ if (isConfigurationMutable()) {
+ ((MutableConfigurationProvider) csConfProvider).close();
+ }
+ super.serviceStop();
+ }
+
+ public void reinitialize(Configuration newConf, RMContext rmContext,
+ boolean validation) throws IOException {
+ writeLock.lock();
try {
- PlacementRule rule = PlacementFactory.getPlacementRule(
- placementRuleStr, conf);
- if (null != rule) {
+ Configuration configuration = new Configuration(newConf);
+ CapacitySchedulerConfiguration oldConf = this.conf;
+ if (validation) {
+ this.conf = new CapacitySchedulerConfiguration(newConf, false);
+ } else {
+ this.conf = csConfProvider.loadConfiguration(configuration);
+ }
+ validateConf(this.conf);
try {
- isMappingNotEmpty = rule.initialize(this);
- } catch (IOException ie) {
- throw new IOException(ie);
- }
- if (isMappingNotEmpty) {
- placementRules.add(rule);
- }
- }
- } catch (ClassNotFoundException cnfe) {
- throw new IOException(cnfe);
- }
- }
- }
-
- rmContext.getQueuePlacementManager().updateRules(placementRules);
- }
-
- @Lock(CapacityScheduler.class)
- private void initializeQueues(CapacitySchedulerConfiguration conf)
- throws YarnException {
- try {
- this.queueManager.initializeQueues(conf);
-
- updatePlacementRules();
-
- this.workflowPriorityMappingsMgr.initialize(this);
-
- // Notify Preemption Manager
- preemptionManager.refreshQueues(null, this.getRootQueue());
- } catch (Exception e) {
- throw new YarnException("Failed to initialize queues", e);
- }
- }
-
- @Lock(CapacityScheduler.class)
- private void reinitializeQueues(CapacitySchedulerConfiguration newConf)
- throws IOException {
- this.queueManager.reinitializeQueues(newConf);
- updatePlacementRules();
-
- this.workflowPriorityMappingsMgr.initialize(this);
-
- // Notify Preemption Manager
- preemptionManager.refreshQueues(null, this.getRootQueue());
- }
-
- @Override
- public CSQueue getQueue(String queueName) {
- if (queueName == null) {
- return null;
- }
- return this.queueManager.getQueue(queueName);
- }
-
- /**
- * Returns the normalized queue name, which should be used for internal
- * queue references. Currently this is the full queue name, which
- * unambiguously identifies a queue.
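- * For example, a short name "q" may normalize to "root.a.q" when only
- * one queue named "q" exists (illustrative).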
- * @param name Name of the queue to be normalized
- * @return The normalized (full name) of the queue
- */
- public String normalizeQueueName(String name) {
- if (this.queueManager == null) {
- return name;
- }
- return this.queueManager.normalizeQueueName(name);
- }
-
- /**
- * Determines if a short queue name reference is ambiguous: if there are
- * at least two queues with the same short name, it is considered
- * ambiguous; otherwise it is not.
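- * For example, with queues "root.a.q" and "root.b.q" configured, the
- * short name "q" is ambiguous (illustrative).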
- * @param queueName The name of the queue to check for ambiguity
- * @return true if there are at least 2 queues with the same name
- */
- public boolean isAmbiguous(String queueName) {
- return this.queueManager.isAmbiguous(queueName);
- }
-
- private void addApplicationOnRecovery(ApplicationId applicationId,
- String queueName, String user,
- Priority priority, ApplicationPlacementContext placementContext) {
- writeLock.lock();
- try {
- //check if the queue needs to be auto-created during recovery
- CSQueue queue = getOrCreateQueueFromPlacementContext(applicationId, user,
- queueName, placementContext, true);
-
- if (queue == null) {
- //During a restart, this indicates a queue was removed, which is
- //not presently supported
- if (!getConfiguration().shouldAppFailFast(getConfig())) {
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.KILL,
- "Application killed on recovery as it"
- + " was submitted to queue " + queueName
- + " which no longer exists after restart."));
- return;
- } else{
- String queueErrorMsg = "Queue named " + queueName + " missing "
- + "during application recovery."
- + " Queue removal during recovery is not presently "
- + "supported by the capacity scheduler, please "
- + "restart with all queues configured"
- + " which were present before shutdown/restart.";
- LOG.error(FATAL, queueErrorMsg);
- throw new QueueInvalidException(queueErrorMsg);
- }
- }
- if (!(queue instanceof LeafQueue)) {
- // During RM restart, this means leaf queue was converted to a parent
- // queue, which is not supported for running apps.
- if (!getConfiguration().shouldAppFailFast(getConfig())) {
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.KILL,
- "Application killed on recovery as it was "
- + "submitted to queue " + queueName
- + " which is no longer a leaf queue after restart."));
- return;
- } else{
- String queueErrorMsg = "Queue named " + queueName
- + " is no longer a leaf queue during application recovery."
- + " Changing a leaf queue to a parent queue during recovery is"
- + " not presently supported by the capacity scheduler. Please"
- + " restart with leaf queues before shutdown/restart continuing"
- + " as leaf queues.";
- LOG.error(FATAL, queueErrorMsg);
- throw new QueueInvalidException(queueErrorMsg);
- }
- }
- // When recovering apps in this queue while the queue is in STOPPED
- // state, its previous state must have been DRAINING. So we
- // automatically transition the state back to DRAINING for recovery.
- if (queue.getState() == QueueState.STOPPED) {
- ((LeafQueue) queue).recoverDrainingState();
- }
- // Submit to the queue
- try {
- queue.submitApplication(applicationId, user, queueName);
- } catch (AccessControlException ace) {
- // Ignore the exception for recovered app as the app was previously
- // accepted.
- LOG.warn("AccessControlException received when trying to recover "
- + applicationId + " in queue " + queueName + " for user " + user
- + ". Since the app was in the queue prior to recovery, the Capacity"
- + " Scheduler will recover the app anyway.", ace);
- }
- queue.getMetrics().submitApp(user);
- SchedulerApplication application =
- new SchedulerApplication(queue, user, priority);
- applications.put(applicationId, application);
- LOG.info("Accepted application " + applicationId + " from user: " + user
- + ", in queue: " + queueName);
- LOG.debug(
- applicationId + " is recovering. Skip notifying APP_ACCEPTED");
- } finally {
- writeLock.unlock();
- }
- }
-
- private CSQueue getOrCreateQueueFromPlacementContext(ApplicationId
- applicationId, String user, String queueName,
- ApplicationPlacementContext placementContext,
- boolean isRecovery) {
-
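- // Illustrative: a placement context with parent "root.users" and leaf
- // "alice" (assumed names) allows "root.users.alice" to be auto-created
- // below when no queue with that name exists yet.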
- CSQueue queue = getQueue(queueName);
-
- if (queue == null) {
- if (placementContext != null && placementContext.hasParentQueue()) {
- try {
- return autoCreateLeafQueue(placementContext);
- } catch (YarnException | IOException e) {
- if (isRecovery) {
- if (!getConfiguration().shouldAppFailFast(getConfig())) {
- LOG.error("Could not auto-create leaf queue " + queueName +
- " due to : ", e);
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.KILL,
- "Application killed on recovery"
- + " as it was submitted to queue " + queueName
- + " which could not be auto-created"));
- } else{
- String queueErrorMsg =
- "Queue named " + queueName + " could not be "
- + "auto-created during application recovery.";
- LOG.error(FATAL, queueErrorMsg, e);
- throw new QueueInvalidException(queueErrorMsg);
- }
- } else{
- LOG.error("Could not auto-create leaf queue due to : ", e);
- final String message =
- "Application " + applicationId + " submission by user : "
- + user
- + " to queue : " + queueName + " failed : " + e
- .getMessage();
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
- message));
- }
- }
- }
- }
- return queue;
- }
-
- private void addApplication(ApplicationId applicationId, String queueName,
- String user, Priority priority,
- ApplicationPlacementContext placementContext) {
- writeLock.lock();
- try {
- if (isSystemAppsLimitReached()) {
- String message = "Maximum system application limit reached,"
- + "cannot accept submission of application: " + applicationId;
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
- message));
- return;
- }
-
- //Could be a potential auto-created leaf queue
- CSQueue queue = getOrCreateQueueFromPlacementContext(applicationId, user,
- queueName, placementContext, false);
-
- if (queue == null) {
- String message;
- if (isAmbiguous(queueName)) {
- message = "Application " + applicationId
- + " submitted by user " + user
- + " to ambiguous queue: " + queueName
- + " please use full queue path instead.";
- } else {
- message =
- "Application " + applicationId + " submitted by user " + user
- + " to unknown queue: " + queueName;
- }
-
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
- message));
- return;
- }
-
- if (!(queue instanceof LeafQueue)) {
- String message =
- "Application " + applicationId + " submitted by user : " + user
- + " to non-leaf queue : " + queueName;
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
- message));
- return;
- } else if (queue instanceof AutoCreatedLeafQueue && queue
- .getParent() instanceof ManagedParentQueue) {
-
- //If queue already exists and auto-queue creation was not required,
- //placement context should not be null
- if (placementContext == null) {
- String message =
- "Application " + applicationId + " submission by user : " + user
- + " to specified queue : " + queueName + " is prohibited. "
- + "Verify automatic queue mapping for user exists in " +
- QUEUE_MAPPING;
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
- message));
- return;
- // For a queue which already exists and was not auto-created above,
- // its parent queue should match the parent queue specified in the
- // queue mapping
- } else if (!queue.getParent().getQueueShortName().equals(
- placementContext.getParentQueue())
- && !queue.getParent().getQueuePath().equals(
- placementContext.getParentQueue())) {
- String message =
- "Auto created Leaf queue " + placementContext.getQueue() + " "
- + "already exists under queue : " + queue
- .getParent().getQueueShortName()
- + ". But Queue mapping configuration " +
- CapacitySchedulerConfiguration.QUEUE_MAPPING + " has been "
- + "updated to a different parent queue : "
- + placementContext.getParentQueue()
- + " for the specified user : " + user;
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
- message));
- return;
- }
- }
-
- try {
- priority = workflowPriorityMappingsMgr.mapWorkflowPriorityForApp(
- applicationId, queue, user, priority);
- } catch (YarnException e) {
- String message = "Failed to submit application " + applicationId +
- " submitted by user " + user + " reason: " + e.getMessage();
- this.rmContext.getDispatcher().getEventHandler().handle(new RMAppEvent(
- applicationId, RMAppEventType.APP_REJECTED, message));
- return;
- }
-
- // Submit to the queue
- try {
- queue.submitApplication(applicationId, user, queueName);
- } catch (AccessControlException ace) {
- LOG.info("Failed to submit application " + applicationId + " to queue "
- + queueName + " from user " + user, ace);
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
- ace.toString()));
- return;
- }
- // update the metrics
- queue.getMetrics().submitApp(user);
- SchedulerApplication application =
- new SchedulerApplication(queue, user, priority);
- applications.put(applicationId, application);
- LOG.info("Accepted application " + applicationId + " from user: " + user
- + ", in queue: " + queueName);
- rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_ACCEPTED));
- } finally {
- writeLock.unlock();
- }
- }
-
- private void addApplicationAttempt(
- ApplicationAttemptId applicationAttemptId,
- boolean transferStateFromPreviousAttempt,
- boolean isAttemptRecovering) {
- writeLock.lock();
- try {
- SchedulerApplication application = applications.get(
- applicationAttemptId.getApplicationId());
- if (application == null) {
- LOG.warn("Application " + applicationAttemptId.getApplicationId()
- + " cannot be found in scheduler.");
- return;
- }
- CSQueue queue = (CSQueue) application.getQueue();
-
- FiCaSchedulerApp attempt = new FiCaSchedulerApp(applicationAttemptId,
- application.getUser(), queue, queue.getAbstractUsersManager(),
- rmContext, application.getPriority(), isAttemptRecovering,
- activitiesManager);
- if (transferStateFromPreviousAttempt) {
- attempt.transferStateFromPreviousAttempt(
- application.getCurrentAppAttempt());
- }
- application.setCurrentAppAttempt(attempt);
-
- // Update the attempt priority to the latest to avoid a race condition,
- // i.e. SchedulerApplicationAttempt is created with the old priority but
- // is not yet set via SchedulerApplication#setCurrentAppAttempt.
- // The scenario is:
- // 1. SchedulerApplicationAttempt is created with the old priority.
- // 2. updateApplicationPriority() updates SchedulerApplication. Since
- // currentAttempt is null, it just returns.
- // 3. SchedulerApplicationAttempt is set in
- // SchedulerApplication#setCurrentAppAttempt.
- attempt.setPriority(application.getPriority());
-
- maxRunningEnforcer.checkRunnabilityWithUpdate(attempt);
- maxRunningEnforcer.trackApp(attempt);
-
- queue.submitApplicationAttempt(attempt, application.getUser());
- LOG.info("Added Application Attempt " + applicationAttemptId
- + " to scheduler from user " + application.getUser() + " in queue "
- + queue.getQueuePath());
- if (isAttemptRecovering) {
- LOG.debug("{} is recovering. Skipping notifying ATTEMPT_ADDED",
- applicationAttemptId);
- } else{
- rmContext.getDispatcher().getEventHandler().handle(
- new RMAppAttemptEvent(applicationAttemptId,
- RMAppAttemptEventType.ATTEMPT_ADDED));
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- private void doneApplication(ApplicationId applicationId,
- RMAppState finalState) {
- writeLock.lock();
- try {
- SchedulerApplication application = applications.get(
- applicationId);
- if (application == null) {
- // The AppRemovedSchedulerEvent may be sent on recovery for completed
- // apps; ignore it.
- LOG.warn("Couldn't find application " + applicationId);
- return;
- }
- CSQueue queue = (CSQueue) application.getQueue();
- if (!(queue instanceof LeafQueue)) {
- LOG.error("Cannot finish application " + "from non-leaf queue: " + queue
- .getQueuePath());
- } else{
- queue.finishApplication(applicationId, application.getUser());
- }
- application.stop(finalState);
- applications.remove(applicationId);
- } finally {
- writeLock.unlock();
- }
- }
-
- private void doneApplicationAttempt(
- ApplicationAttemptId applicationAttemptId,
- RMAppAttemptState rmAppAttemptFinalState, boolean keepContainers) {
- writeLock.lock();
- try {
- LOG.info("Application Attempt " + applicationAttemptId + " is done."
- + " finalState=" + rmAppAttemptFinalState);
-
- FiCaSchedulerApp attempt = getApplicationAttempt(applicationAttemptId);
- SchedulerApplication application = applications.get(
- applicationAttemptId.getApplicationId());
-
- if (application == null || attempt == null) {
- LOG.info(
- "Unknown application " + applicationAttemptId + " has completed!");
- return;
- }
-
- // Release all the allocated, acquired, running containers
- for (RMContainer rmContainer : attempt.getLiveContainers()) {
- if (keepContainers && rmContainer.getState().equals(
- RMContainerState.RUNNING)) {
- // do not kill the running container in the case of work-preserving AM
- // restart.
- LOG.info("Skip killing " + rmContainer.getContainerId());
- continue;
- }
- super.completedContainer(rmContainer, SchedulerUtils
- .createAbnormalContainerStatus(rmContainer.getContainerId(),
- SchedulerUtils.COMPLETED_APPLICATION),
- RMContainerEventType.KILL);
- }
-
- // Release all reserved containers
- for (RMContainer rmContainer : attempt.getReservedContainers()) {
- super.completedContainer(rmContainer, SchedulerUtils
- .createAbnormalContainerStatus(rmContainer.getContainerId(),
- "Application Complete"), RMContainerEventType.KILL);
- }
-
- // Clean up pending requests, metrics etc.
- attempt.stop(rmAppAttemptFinalState);
-
- // Inform the queue
- Queue queue = attempt.getQueue();
- CSQueue csQueue = (CSQueue) queue;
- if (!(csQueue instanceof LeafQueue)) {
- LOG.error(
- "Cannot finish application " + "from non-leaf queue: "
- + csQueue.getQueuePath());
- } else {
- csQueue.finishApplicationAttempt(attempt, csQueue.getQueuePath());
-
- maxRunningEnforcer.untrackApp(attempt);
- if (attempt.isRunnable()) {
- maxRunningEnforcer.updateRunnabilityOnAppRemoval(attempt);
- }
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- /**
- * Normalize a list of SchedulingRequest.
- *
- * @param asks the scheduling requests to normalize
- */
- private void normalizeSchedulingRequests(List asks) {
- if (asks == null) {
- return;
- }
- Resource maxAllocation = getMaximumResourceCapability();
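- // Illustrative (assumed limits): with a maximum allocation of
- // <8192 MB, 8 vcores>, an oversized ask such as <10240 MB, 4 vcores>
- // is normalized below so it does not exceed the maximum.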
- for (SchedulingRequest ask: asks) {
- ResourceSizing sizing = ask.getResourceSizing();
- if (sizing != null && sizing.getResources() != null) {
- sizing.setResources(
- getNormalizedResource(sizing.getResources(), maxAllocation));
- }
- }
- }
-
- @Override
- @Lock(Lock.NoLock.class)
- public Allocation allocate(ApplicationAttemptId applicationAttemptId,
- List ask, List schedulingRequests,
- List release, List blacklistAdditions,
- List blacklistRemovals, ContainerUpdates updateRequests) {
- FiCaSchedulerApp application = getApplicationAttempt(applicationAttemptId);
- if (application == null) {
- LOG.error("Calling allocate on removed or non existent application " +
- applicationAttemptId.getApplicationId());
- return EMPTY_ALLOCATION;
- }
-
- // The allocate call may be a leftover from a previous attempt, and it
- // can impact the current attempt, e.g. confusing the request and
- // allocation for the current attempt's AM container.
- // Note that the outside precondition check for the attempt id may be
- // outdated here, so double checking it here is necessary.
- if (!application.getApplicationAttemptId().equals(applicationAttemptId)) {
- LOG.error("Calling allocate on previous or removed " +
- "or non existent application attempt " + applicationAttemptId);
- return EMPTY_ALLOCATION;
- }
-
- // Handle all container updates
- handleContainerUpdates(application, updateRequests);
-
- // Release containers
- releaseContainers(release, application);
-
- LeafQueue updateDemandForQueue = null;
-
- // Sanity check for new allocation requests
- normalizeResourceRequests(ask);
-
- // Normalize scheduling requests
- normalizeSchedulingRequests(schedulingRequests);
-
- Allocation allocation;
-
- // make sure we aren't stopping/removing the application
- // when the allocate comes in
- application.getWriteLock().lock();
- try {
- if (application.isStopped()) {
- return EMPTY_ALLOCATION;
- }
-
- // Process resource requests
- if (!ask.isEmpty() || (schedulingRequests != null && !schedulingRequests
- .isEmpty())) {
- if (LOG.isDebugEnabled()) {
- LOG.debug(
- "allocate: pre-update " + applicationAttemptId + " ask size ="
- + ask.size());
- application.showRequests();
- }
-
- // Update application requests
- if (application.updateResourceRequests(ask) || application
- .updateSchedulingRequests(schedulingRequests)) {
- updateDemandForQueue = (LeafQueue) application.getQueue();
- }
-
- if (LOG.isDebugEnabled()) {
- LOG.debug("allocate: post-update");
- application.showRequests();
- }
- }
-
- application.updateBlacklist(blacklistAdditions, blacklistRemovals);
-
- allocation = application.getAllocation(getResourceCalculator(),
- getClusterResource(), getMinimumResourceCapability());
- } finally {
- application.getWriteLock().unlock();
- }
-
- if (updateDemandForQueue != null && !application
- .isWaitingForAMContainer()) {
- updateDemandForQueue.getOrderingPolicy().demandUpdated(application);
- }
-
- LOG.debug("Allocation for application {} : {} with cluster resource : {}",
- applicationAttemptId, allocation, getClusterResource());
- return allocation;
- }
-
- @Override
- @Lock(Lock.NoLock.class)
- public QueueInfo getQueueInfo(String queueName,
- boolean includeChildQueues, boolean recursive)
- throws IOException {
- CSQueue queue = null;
- queue = this.getQueue(queueName);
- if (queue == null) {
- if (isAmbiguous(queueName)) {
- throw new IOException("Ambiguous queue reference: " + queueName
- + " please use full queue path instead.");
- } else {
- throw new IOException("Unknown queue: " + queueName);
- }
-
- }
- return queue.getQueueInfo(includeChildQueues, recursive);
- }
-
- @Override
- @Lock(Lock.NoLock.class)
- public List getQueueUserAclInfo() {
- UserGroupInformation user = null;
- try {
- user = UserGroupInformation.getCurrentUser();
- } catch (IOException ioe) {
- // should never happen
- return new ArrayList();
- }
-
- return getRootQueue().getQueueUserAclInfo(user);
- }
-
- @Override
- protected void nodeUpdate(RMNode rmNode) {
- long begin = System.nanoTime();
- readLock.lock();
- try {
- setLastNodeUpdateTime(Time.now());
- super.nodeUpdate(rmNode);
- } finally {
- readLock.unlock();
- }
-
- // Try to do scheduling
- if (!scheduleAsynchronously) {
- writeLock.lock();
- try {
- // reset allocation and reservation stats before we start doing any
- // work
- updateSchedulerHealth(lastNodeUpdateTime, rmNode.getNodeID(),
- CSAssignment.NULL_ASSIGNMENT);
-
- allocateContainersToNode(rmNode.getNodeID(), true);
- } finally {
- writeLock.unlock();
- }
- }
-
- long latency = System.nanoTime() - begin;
- CapacitySchedulerMetrics.getMetrics().addNodeUpdate(latency);
- }
-
- /**
- * Process resource update on a node.
- */
- private void updateNodeAndQueueResource(RMNode nm,
- ResourceOption resourceOption) {
- writeLock.lock();
- try {
- updateNodeResource(nm, resourceOption);
- Resource clusterResource = getClusterResource();
- getRootQueue().updateClusterResource(clusterResource,
- new ResourceLimits(clusterResource));
- } finally {
- writeLock.unlock();
- }
- }
-
- /**
- * Process node labels update on a node.
- */
- private void updateLabelsOnNode(NodeId nodeId,
- Set newLabels) {
- FiCaSchedulerNode node = nodeTracker.getNode(nodeId);
- if (null == node) {
- return;
- }
-
- // Get new partition, we have only one partition per node
- String newPartition;
- if (newLabels.isEmpty()) {
- newPartition = RMNodeLabelsManager.NO_LABEL;
- } else{
- newPartition = newLabels.iterator().next();
- }
-
- // old partition as well
- String oldPartition = node.getPartition();
-
- // Update resources of these containers
- for (RMContainer rmContainer : node.getCopiedListOfRunningContainers()) {
- FiCaSchedulerApp application = getApplicationAttempt(
- rmContainer.getApplicationAttemptId());
- if (null != application) {
- application.nodePartitionUpdated(rmContainer, oldPartition,
- newPartition);
- } else{
- LOG.warn("There's something wrong, some RMContainers running on"
- + " a node, but we cannot find SchedulerApplicationAttempt "
- + "for it. Node=" + node.getNodeID() + " applicationAttemptId="
- + rmContainer.getApplicationAttemptId());
- continue;
- }
- }
-
- // Unreserve container on this node
- RMContainer reservedContainer = node.getReservedContainer();
- if (null != reservedContainer) {
- killReservedContainer(reservedContainer);
- }
-
- // Update node labels after we've done this
- node.updateLabels(newLabels);
- }
-
- private void updateSchedulerHealth(long now, NodeId nodeId,
- CSAssignment assignment) {
- List allocations =
- assignment.getAssignmentInformation().getAllocationDetails();
- List reservations =
- assignment.getAssignmentInformation().getReservationDetails();
- // Get nodeId from allocated container if incoming argument is null.
- NodeId updatedNodeid = (nodeId == null)
- ? allocations.get(allocations.size() - 1).rmContainer.getNodeId()
- : nodeId;
-
- if (!allocations.isEmpty()) {
- ContainerId allocatedContainerId =
- allocations.get(allocations.size() - 1).containerId;
- String allocatedQueue = allocations.get(allocations.size() - 1).queue;
- schedulerHealth.updateAllocation(now, updatedNodeid, allocatedContainerId,
- allocatedQueue);
- }
- if (!reservations.isEmpty()) {
- ContainerId reservedContainerId =
- reservations.get(reservations.size() - 1).containerId;
- String reservedQueue = reservations.get(reservations.size() - 1).queue;
- schedulerHealth.updateReservation(now, updatedNodeid, reservedContainerId,
- reservedQueue);
- }
- schedulerHealth.updateSchedulerReservationCounts(assignment
- .getAssignmentInformation().getNumReservations());
- schedulerHealth.updateSchedulerAllocationCounts(assignment
- .getAssignmentInformation().getNumAllocations());
- schedulerHealth.updateSchedulerRunDetails(now, assignment
- .getAssignmentInformation().getAllocated(), assignment
- .getAssignmentInformation().getReserved());
- }
-
- private boolean canAllocateMore(CSAssignment assignment, int offswitchCount,
- int assignedContainers) {
- // Current assignment shouldn't be empty
- if (assignment == null
- || Resources.equals(assignment.getResource(), Resources.none())) {
- return false;
- }
-
- // offswitch assignment should be under threshold
- if (offswitchCount >= offswitchPerHeartbeatLimit) {
- return false;
- }
-
- // And it should not be a reserved container
- if (assignment.getAssignmentInformation().getNumReservations() > 0) {
- return false;
- }
-
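- // Illustrative: with assignMultipleEnabled = true and
- // maxAssignPerHeartbeat = 10 (assumed values), up to 10 containers can
- // be assigned per heartbeat; maxAssignPerHeartbeat == -1 means no limit.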
- // assignMultipleEnabled should be ON,
- // and assignedContainers should be under threshold
- return assignMultipleEnabled
- && (maxAssignPerHeartbeat == -1
- || assignedContainers < maxAssignPerHeartbeat);
- }
-
- private CandidateNodeSet getCandidateNodeSet(
- FiCaSchedulerNode node) {
- CandidateNodeSet candidates = null;
- candidates = new SimpleCandidateNodeSet<>(node);
- if (multiNodePlacementEnabled) {
- Map nodesByPartition = new HashMap<>();
- List nodes = nodeTracker
- .getNodesPerPartition(node.getPartition());
- if (nodes != null && !nodes.isEmpty()) {
- nodes.forEach(n -> nodesByPartition.put(n.getNodeID(), n));
- candidates = new SimpleCandidateNodeSet(
- nodesByPartition, node.getPartition());
- }
- }
- return candidates;
- }
-
- /**
- * We need to make sure the node exists before doing allocation,
- * and we construct a {@link CandidateNodeSet} before proceeding.
- */
- private void allocateContainersToNode(NodeId nodeId,
- boolean withNodeHeartbeat) {
- FiCaSchedulerNode node = getNode(nodeId);
- if (null != node) {
- int offswitchCount = 0;
- int assignedContainers = 0;
-
- CandidateNodeSet candidates = getCandidateNodeSet(
- node);
- CSAssignment assignment = allocateContainersToNode(candidates,
- withNodeHeartbeat);
- // Only check if we can allocate more containers on the same node when
- // scheduling is triggered by node heartbeat
- if (null != assignment && withNodeHeartbeat) {
- if (assignment.getType() == NodeType.OFF_SWITCH) {
- offswitchCount++;
+ LOG.info("Re-initializing queues...");
+ refreshMaximumAllocation(
+ ResourceUtils.fetchMaximumAllocationFromConfig(this.conf));
+ reinitializeQueues(this.conf);
+ } catch (Throwable t) {
+ this.conf = oldConf;
+ refreshMaximumAllocation(
+ ResourceUtils.fetchMaximumAllocationFromConfig(this.conf));
+ throw new IOException("Failed to re-init queues : " + t.getMessage(),
+ t);
+ }
+ if (!validation) {
+
+ // update lazy preemption
+ this.isLazyPreemptionEnabled = this.conf.getLazyPreemptionEnabled();
+
+ // Setup how many containers we can allocate for each round
+ assignMultipleEnabled = this.conf.getAssignMultipleEnabled();
+ maxAssignPerHeartbeat = this.conf.getMaxAssignPerHeartbeat();
+ offswitchPerHeartbeatLimit = this.conf.getOffSwitchPerHeartbeatLimit();
+
+ LOG.info("assignMultipleEnabled = " + assignMultipleEnabled + "\n" +
+ "maxAssignPerHeartbeat = " + maxAssignPerHeartbeat + "\n" +
+ "offswitchPerHeartbeatLimit = " + offswitchPerHeartbeatLimit);
+
+ super.reinitialize(newConf, rmContext);
+ }
+ maxRunningEnforcer.updateRunnabilityOnReload();
+ } finally {
+ writeLock.unlock();
}
- if (Resources.greaterThan(calculator, getClusterResource(),
- assignment.getResource(), Resources.none())) {
- assignedContainers++;
+ }
+
+ @Override
+ public void reinitialize(Configuration newConf, RMContext rmContext)
+ throws IOException {
+ reinitialize(newConf, rmContext, false);
+ }
+
+ long getAsyncScheduleInterval() {
+ return asyncScheduleInterval;
+ }
+
+ private final static Random random = new Random(System.currentTimeMillis());
+
+ private static boolean shouldSkipNodeSchedule(FiCaSchedulerNode node,
+ CapacityScheduler cs, boolean printVerboseLog) {
+ // Skip nodes which missed 2 heartbeats since such a node might be dead
+ // and we should not continue to allocate containers on it.
+ long timeElapsedFromLastHeartbeat =
+ Time.monotonicNow() - node.getLastHeartbeatMonotonicTime();
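+ // Illustrative: assuming nmHeartbeatInterval = 1000 ms, a node whose
+ // last heartbeat is more than 2000 ms old is treated as possibly dead
+ // and skipped.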
+ if (timeElapsedFromLastHeartbeat > cs.nmHeartbeatInterval * 2) {
+ if (printVerboseLog && LOG.isDebugEnabled()) {
+ LOG.debug("Skip scheduling on node because it haven't heartbeated for "
+ + timeElapsedFromLastHeartbeat / 1000.0f + " secs");
+ }
+ return true;
}
+ return false;
+ }
- while (canAllocateMore(assignment, offswitchCount,
- assignedContainers)) {
- // Try to see if it is possible to allocate multiple containers for
- // the same node heartbeat
- assignment = allocateContainersToNode(candidates, true);
+ /**
+ * Schedule on nodes in the globally sorted order provided by the
+ * loads metric server.
+ *
+ * @param cs the CapacityScheduler to run scheduling for
+ * @param sortedNodesHost node host names in globally sorted order
+ */
+ static void scheduleLoadBased(CapacityScheduler cs,
+ List sortedNodesHost) throws InterruptedException {
+ Collection nodes = cs.nodeTracker.getAllNodes();
+ Map nodeMap = new HashMap<>();
+ for (FiCaSchedulerNode node : nodes) {
+ nodeMap.put(node.getNodeName(), node);
+ }
- if (null != assignment
- && assignment.getType() == NodeType.OFF_SWITCH) {
- offswitchCount++;
- }
+ // To avoid too verbose DEBUG logging, only print the debug log once
+ // every 10 secs.
+ boolean printSkipedNodeLogging = false;
+ if (Time.monotonicNow() / 1000 % 10 == 0) {
+ printSkipedNodeLogging = (!printedVerboseLoggingForAsyncScheduling);
+ } else {
+ printedVerboseLoggingForAsyncScheduling = false;
+ }
- if (null != assignment
- && Resources.greaterThan(calculator, getClusterResource(),
- assignment.getResource(), Resources.none())) {
- assignedContainers++;
- }
+ // Allocate containers of node global sorted
+ int allocateNodesLimit = (int) Math.ceil(cs.asyncScheduleGlobalAllocatePercent * nodes.size() / 100);
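+ // Worked example (illustrative values): with 200 registered nodes and
+ // asyncScheduleGlobalAllocatePercent = 30, allocateNodesLimit =
+ // ceil(30 * 200 / 100) = 60, so at most the first 60 schedulable hosts
+ // in the sorted list are visited this round.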
+ for (String nodeHost : sortedNodesHost) {
+ if (!nodeMap.containsKey(nodeHost)) {
+ LOG.warn("nodeHost {} not found FiCaSchedulerNode", nodeHost);
+ continue;
+ }
+ FiCaSchedulerNode node = nodeMap.get(nodeHost);
+ if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) {
+ continue;
+ }
+ cs.allocateContainersToNode(node.getNodeID(), false);
+ allocateNodesLimit -= 1;
+ if (allocateNodesLimit <= 0) {
+ break;
+ }
}
+ Thread.sleep(cs.getAsyncScheduleInterval());
+ }
+
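+ // Minimal driver sketch (assumption: this approximates what
+ // LoadsMetricServerRequestThread does; the server's response format is
+ // not shown in this patch, and fetchSortedHosts is a hypothetical
+ // helper):
+ //
+ //   List<String> sortedNodesHost =
+ //       fetchSortedHosts(cs.loadsMetricServerSortUrl);
+ //   scheduleLoadBased(cs, sortedNodesHost);
+ //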
+ /**
+ * Schedule on all nodes by starting at a random point.
+ * @param cs the CapacityScheduler to run scheduling for
+ */
+ static void schedule(CapacityScheduler cs) throws InterruptedException {
+ // First randomize the start point
+ int current = 0;
+ Collection nodes = cs.nodeTracker.getAllNodes();
+
+ // If the node count is 0 (i.e., no node managers are registered),
+ // we can return from here itself.
+ int nodeSize = nodes.size();
+ if (nodeSize == 0) {
+ return;
+ }
+ int start = random.nextInt(nodeSize);
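+ // Illustrative: with 5 nodes [n0..n4] and start = 2, the wrap-around
+ // passes visit n2, n3, n4 and then n0, n1, n2, so every node gets a
+ // chance in each round regardless of registration order.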
- if (offswitchCount >= offswitchPerHeartbeatLimit) {
- LOG.debug("Assigned maximum number of off-switch containers: {},"
- + " assignments so far: {}", offswitchCount, assignment);
- }
- }
- }
- }
-
- /*
- * Logic for allocating containers on a single node (old behavior)
- */
- private CSAssignment allocateContainerOnSingleNode(
- CandidateNodeSet candidates, FiCaSchedulerNode node,
- boolean withNodeHeartbeat) {
- LOG.debug("Trying to schedule on node: {}, available: {}",
- node.getNodeName(), node.getUnallocatedResource());
-
- // Backward-compatible way to make sure the previous behavior, where
- // allocation is driven by node heartbeat, keeps working.
- if (getNode(node.getNodeID()) != node) {
- LOG.error("Trying to schedule on a removed node, please double check, "
- + "nodeId=" + node.getNodeID());
- ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
- "", getRootQueue().getQueuePath(), ActivityState.REJECTED,
- ActivityDiagnosticConstant.INIT_CHECK_SINGLE_NODE_REMOVED);
- ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
- node);
- return null;
- }
-
- // Assign new containers...
- // 1. Check for reserved applications
- // 2. Schedule if there are no reservations
- RMContainer reservedContainer = node.getReservedContainer();
- if (reservedContainer != null) {
- allocateFromReservedContainer(node, withNodeHeartbeat, reservedContainer);
- // Do not schedule if there are any reservations to fulfill on the node
- LOG.debug("Skipping scheduling since node {} is reserved by"
- + " application {}", node.getNodeID(), reservedContainer.
- getContainerId().getApplicationAttemptId());
- return null;
- }
-
- // First check if we can schedule.
- // Since this time we look at one node only, try to schedule if the
- // node has any available or killable resource
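- // Illustrative (assumed values): with 1024 MB unallocated, 1024 MB
- // killable and a 1024 MB minimum allocation, computeAvailableContainers
- // returns 2 and scheduling proceeds; it bails out only when not even
- // one minimum-sized container fits.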
- if (calculator.computeAvailableContainers(Resources
- .add(node.getUnallocatedResource(), node.getTotalKillableResources()),
- minimumAllocation) <= 0) {
- LOG.debug("This node " + node.getNodeID() + " doesn't have sufficient "
- + "available or preemptible resource for minimum allocation");
- ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
- "", getRootQueue().getQueuePath(), ActivityState.REJECTED,
- ActivityDiagnosticConstant.
- INIT_CHECK_SINGLE_NODE_RESOURCE_INSUFFICIENT);
- ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
- node);
- return null;
- }
-
- return allocateOrReserveNewContainers(candidates, withNodeHeartbeat);
- }
-
- private void allocateFromReservedContainer(FiCaSchedulerNode node,
- boolean withNodeHeartbeat, RMContainer reservedContainer) {
- FiCaSchedulerApp reservedApplication = getCurrentAttemptForContainer(
- reservedContainer.getContainerId());
- if (reservedApplication == null) {
- LOG.error(
- "Trying to schedule for a finished app, please double check. nodeId="
- + node.getNodeID() + " container=" + reservedContainer
- .getContainerId());
- return;
- }
-
- // Try to fulfill the reservation
- LOG.debug("Trying to fulfill reservation for application {} on node: {}",
- reservedApplication.getApplicationId(), node.getNodeID());
-
- LeafQueue queue = ((LeafQueue) reservedApplication.getQueue());
- CSAssignment assignment = queue.assignContainers(getClusterResource(),
- new SimpleCandidateNodeSet<>(node),
- // TODO, now we only consider limits for parent for non-labeled
- // resources, should consider labeled resources as well.
- new ResourceLimits(labelManager
- .getResourceByLabel(RMNodeLabelsManager.NO_LABEL,
- getClusterResource())),
- SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
-
- if (assignment.isFulfilledReservation()) {
- if (withNodeHeartbeat) {
- // Only update SchedulerHealth in sync scheduling; the existing
- // data structures of SchedulerHealth need to be updated for
- // async mode
- updateSchedulerHealth(lastNodeUpdateTime, node.getNodeID(),
- assignment);
- }
-
- schedulerHealth.updateSchedulerFulfilledReservationCounts(1);
-
- ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
- queue.getParent().getQueuePath(), queue.getQueuePath(),
- ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY);
- ActivitiesLogger.NODE.finishAllocatedNodeAllocation(activitiesManager,
- node, reservedContainer.getContainerId(),
- AllocationState.ALLOCATED_FROM_RESERVED);
- } else if (assignment.getAssignmentInformation().getNumReservations() > 0) {
- ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
- queue.getParent().getQueuePath(), queue.getQueuePath(),
- ActivityState.RE_RESERVED, ActivityDiagnosticConstant.EMPTY);
- ActivitiesLogger.NODE.finishAllocatedNodeAllocation(activitiesManager,
- node, reservedContainer.getContainerId(), AllocationState.RESERVED);
- }
-
- assignment.setSchedulingMode(
- SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
- submitResourceCommitRequest(getClusterResource(), assignment);
- }
-
- private CSAssignment allocateOrReserveNewContainers(
- CandidateNodeSet candidates,
- boolean withNodeHeartbeat) {
- CSAssignment assignment = getRootQueue().assignContainers(
- getClusterResource(), candidates, new ResourceLimits(labelManager
- .getResourceByLabel(candidates.getPartition(),
- getClusterResource())),
- SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
-
- assignment.setSchedulingMode(SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
- submitResourceCommitRequest(getClusterResource(), assignment);
-
- if (Resources.greaterThan(calculator, getClusterResource(),
- assignment.getResource(), Resources.none())) {
- FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);
- NodeId nodeId = null;
- if (node != null) {
- nodeId = node.getNodeID();
- }
- if (withNodeHeartbeat) {
- updateSchedulerHealth(lastNodeUpdateTime, nodeId, assignment);
- }
- return assignment;
- }
-
- // Only do non-exclusive allocation when the node has node labels.
- if (StringUtils.equals(candidates.getPartition(),
- RMNodeLabelsManager.NO_LABEL)) {
- return null;
- }
-
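- // Illustrative: an idle node in a non-exclusive partition such as
- // "gpu" (assumed label) may serve NO_LABEL requests through
- // IGNORE_PARTITION_EXCLUSIVITY below instead of staying idle.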
- // Only do non-exclusive allocation when the node-label supports that
- try {
- if (rmContext.getNodeLabelManager().isExclusiveNodeLabel(
- candidates.getPartition())) {
- return null;
- }
- } catch (IOException e) {
- LOG.warn(
- "Exception when trying to get exclusivity of node label=" + candidates
- .getPartition(), e);
- return null;
- }
-
- // Try to use NON_EXCLUSIVE
- assignment = getRootQueue().assignContainers(getClusterResource(),
- candidates,
- // TODO, now we only consider limits for parent for non-labeled
- // resources, should consider labeled resources as well.
- new ResourceLimits(labelManager
- .getResourceByLabel(RMNodeLabelsManager.NO_LABEL,
- getClusterResource())),
- SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY);
- assignment.setSchedulingMode(SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY);
- submitResourceCommitRequest(getClusterResource(), assignment);
-
- return assignment;
- }
-
- /*
- * New behavior, allocate containers considering multiple nodes
- */
- private CSAssignment allocateContainersOnMultiNodes(
- CandidateNodeSet candidates) {
- // Since this time we look at multiple nodes, try to schedule if the
- // partition has any available or killable resource
- if (getRootQueue().getQueueCapacities().getUsedCapacity(
- candidates.getPartition()) >= 1.0f
- && preemptionManager.getKillableResource(
- CapacitySchedulerConfiguration.ROOT, candidates.getPartition())
- == Resources.none()) {
- // Try to allocate from reserved containers
- for (FiCaSchedulerNode node : candidates.getAllNodes().values()) {
- RMContainer reservedContainer = node.getReservedContainer();
- if (reservedContainer != null) {
- allocateFromReservedContainer(node, false, reservedContainer);
- }
- }
- LOG.debug("This partition '{}' doesn't have available or "
- + "killable resource", candidates.getPartition());
- ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, null,
- "", getRootQueue().getQueuePath(), ActivityState.REJECTED,
- ActivityDiagnosticConstant.
- INIT_CHECK_PARTITION_RESOURCE_INSUFFICIENT);
- ActivitiesLogger.NODE
- .finishSkippedNodeAllocation(activitiesManager, null);
- return null;
- }
-
- return allocateOrReserveNewContainers(candidates, false);
- }
-
- @VisibleForTesting
- CSAssignment allocateContainersToNode(
- CandidateNodeSet candidates,
- boolean withNodeHeartbeat) {
- if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext
- .isSchedulerReadyForAllocatingContainers()) {
- return null;
- }
-
- long startTime = System.nanoTime();
-
- // Backward-compatible way to make sure the previous behavior, where
- // allocation is driven by node heartbeat, keeps working.
- FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);
-
- // We have two different logics to handle allocation on single node / multi
- // nodes.
- CSAssignment assignment;
- if (!multiNodePlacementEnabled) {
- ActivitiesLogger.NODE.startNodeUpdateRecording(activitiesManager,
- node.getNodeID());
- assignment = allocateContainerOnSingleNode(candidates,
- node, withNodeHeartbeat);
- ActivitiesLogger.NODE.finishNodeUpdateRecording(activitiesManager,
- node.getNodeID(), candidates.getPartition());
- } else{
- ActivitiesLogger.NODE.startNodeUpdateRecording(activitiesManager,
- ActivitiesManager.EMPTY_NODE_ID);
- assignment = allocateContainersOnMultiNodes(candidates);
- ActivitiesLogger.NODE.finishNodeUpdateRecording(activitiesManager,
- ActivitiesManager.EMPTY_NODE_ID, candidates.getPartition());
- }
-
- if (assignment != null && assignment.getAssignmentInformation() != null
- && assignment.getAssignmentInformation().getNumAllocations() > 0) {
- long allocateTime = System.nanoTime() - startTime;
- CapacitySchedulerMetrics.getMetrics().addAllocate(allocateTime);
- }
- return assignment;
- }
-
- @Override
- public void handle(SchedulerEvent event) {
- switch (event.getType()) {
- case NODE_ADDED:
- {
- NodeAddedSchedulerEvent nodeAddedEvent = (NodeAddedSchedulerEvent)event;
- addNode(nodeAddedEvent.getAddedRMNode());
- recoverContainersOnNode(nodeAddedEvent.getContainerReports(),
- nodeAddedEvent.getAddedRMNode());
- }
- break;
- case NODE_REMOVED:
- {
- NodeRemovedSchedulerEvent nodeRemovedEvent = (NodeRemovedSchedulerEvent)event;
- removeNode(nodeRemovedEvent.getRemovedRMNode());
- }
- break;
- case NODE_RESOURCE_UPDATE:
- {
- NodeResourceUpdateSchedulerEvent nodeResourceUpdatedEvent =
- (NodeResourceUpdateSchedulerEvent)event;
- updateNodeAndQueueResource(nodeResourceUpdatedEvent.getRMNode(),
- nodeResourceUpdatedEvent.getResourceOption());
- }
- break;
- case NODE_LABELS_UPDATE:
- {
- NodeLabelsUpdateSchedulerEvent labelUpdateEvent =
- (NodeLabelsUpdateSchedulerEvent) event;
-
- updateNodeLabelsAndQueueResource(labelUpdateEvent);
- }
- break;
- case NODE_ATTRIBUTES_UPDATE:
- {
- NodeAttributesUpdateSchedulerEvent attributeUpdateEvent =
- (NodeAttributesUpdateSchedulerEvent) event;
-
- updateNodeAttributes(attributeUpdateEvent);
- }
- break;
- case NODE_UPDATE:
- {
- NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
- nodeUpdate(nodeUpdatedEvent.getRMNode());
- }
- break;
- case APP_ADDED:
- {
- AppAddedSchedulerEvent appAddedEvent = (AppAddedSchedulerEvent) event;
- String queueName = resolveReservationQueueName(appAddedEvent.getQueue(),
- appAddedEvent.getApplicationId(), appAddedEvent.getReservationID(),
- appAddedEvent.getIsAppRecovering());
- if (queueName != null) {
- if (!appAddedEvent.getIsAppRecovering()) {
- addApplication(appAddedEvent.getApplicationId(), queueName,
- appAddedEvent.getUser(), appAddedEvent.getApplicatonPriority(),
- appAddedEvent.getPlacementContext());
+ // To avoid too verbose DEBUG logging, only print the debug log once
+ // every 10 secs.
+ boolean printSkipedNodeLogging = false;
+ if (Time.monotonicNow() / 1000 % 10 == 0) {
+ printSkipedNodeLogging = (!printedVerboseLoggingForAsyncScheduling);
} else {
- addApplicationOnRecovery(appAddedEvent.getApplicationId(), queueName,
- appAddedEvent.getUser(), appAddedEvent.getApplicatonPriority(),
- appAddedEvent.getPlacementContext());
- }
- }
- }
- break;
- case APP_REMOVED:
- {
- AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event;
- doneApplication(appRemovedEvent.getApplicationID(),
- appRemovedEvent.getFinalState());
- }
- break;
- case APP_ATTEMPT_ADDED:
- {
- AppAttemptAddedSchedulerEvent appAttemptAddedEvent =
- (AppAttemptAddedSchedulerEvent) event;
- addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId(),
- appAttemptAddedEvent.getTransferStateFromPreviousAttempt(),
- appAttemptAddedEvent.getIsAttemptRecovering());
- }
- break;
- case APP_ATTEMPT_REMOVED:
- {
- AppAttemptRemovedSchedulerEvent appAttemptRemovedEvent =
- (AppAttemptRemovedSchedulerEvent) event;
- doneApplicationAttempt(appAttemptRemovedEvent.getApplicationAttemptID(),
- appAttemptRemovedEvent.getFinalAttemptState(),
- appAttemptRemovedEvent.getKeepContainersAcrossAppAttempts());
- }
- break;
- case CONTAINER_EXPIRED:
- {
- ContainerExpiredSchedulerEvent containerExpiredEvent =
- (ContainerExpiredSchedulerEvent) event;
- ContainerId containerId = containerExpiredEvent.getContainerId();
- if (containerExpiredEvent.isIncrease()) {
- rollbackContainerUpdate(containerId);
- } else {
- completedContainer(getRMContainer(containerId),
- SchedulerUtils.createAbnormalContainerStatus(
- containerId,
- SchedulerUtils.EXPIRED_CONTAINER),
- RMContainerEventType.EXPIRE);
- }
- }
- break;
- case RELEASE_CONTAINER:
- {
- RMContainer container = ((ReleaseContainerEvent) event).getContainer();
- completedContainer(container,
- SchedulerUtils.createAbnormalContainerStatus(
- container.getContainerId(),
- SchedulerUtils.RELEASED_CONTAINER),
- RMContainerEventType.RELEASED);
- }
- break;
- case KILL_RESERVED_CONTAINER:
- {
- ContainerPreemptEvent killReservedContainerEvent =
- (ContainerPreemptEvent) event;
- RMContainer container = killReservedContainerEvent.getContainer();
- killReservedContainer(container);
- }
- break;
- case MARK_CONTAINER_FOR_PREEMPTION:
- {
- ContainerPreemptEvent preemptContainerEvent =
- (ContainerPreemptEvent)event;
- ApplicationAttemptId aid = preemptContainerEvent.getAppId();
- RMContainer containerToBePreempted = preemptContainerEvent.getContainer();
- markContainerForPreemption(aid, containerToBePreempted);
- }
- break;
- case MARK_CONTAINER_FOR_KILLABLE:
- {
- ContainerPreemptEvent containerKillableEvent = (ContainerPreemptEvent)event;
- RMContainer killableContainer = containerKillableEvent.getContainer();
- markContainerForKillable(killableContainer);
- }
- break;
- case MARK_CONTAINER_FOR_NONKILLABLE:
- {
- if (isLazyPreemptionEnabled) {
- ContainerPreemptEvent cancelKillContainerEvent =
- (ContainerPreemptEvent) event;
- markContainerForNonKillable(cancelKillContainerEvent.getContainer());
- }
- }
- break;
- case MANAGE_QUEUE:
- {
- QueueManagementChangeEvent queueManagementChangeEvent =
- (QueueManagementChangeEvent) event;
- ParentQueue parentQueue = queueManagementChangeEvent.getParentQueue();
- try {
- final List<QueueManagementChange> queueManagementChanges =
- queueManagementChangeEvent.getQueueManagementChanges();
- ((ManagedParentQueue) parentQueue)
- .validateAndApplyQueueManagementChanges(queueManagementChanges);
- } catch (SchedulerDynamicEditException sde) {
- LOG.error("Queue Management Change event cannot be applied for "
- + "parent queue : " + parentQueue.getQueuePath(), sde);
- } catch (IOException ioe) {
- LOG.error("Queue Management Change event cannot be applied for "
- + "parent queue : " + parentQueue.getQueuePath(), ioe);
- }
- }
- break;
- default:
- LOG.error("Invalid eventtype " + event.getType() + ". Ignoring!");
- }
- }
-
- private void updateNodeAttributes(
- NodeAttributesUpdateSchedulerEvent attributeUpdateEvent) {
- writeLock.lock();
- try {
- for (Entry<String, Set<NodeAttribute>> entry : attributeUpdateEvent
- .getUpdatedNodeToAttributes().entrySet()) {
- String hostname = entry.getKey();
- Set<NodeAttribute> attributes = entry.getValue();
- List<NodeId> nodeIds = nodeTracker.getNodeIdsByResourceName(hostname);
- updateAttributesOnNode(nodeIds, attributes);
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- private void updateAttributesOnNode(List<NodeId> nodeIds,
- Set<NodeAttribute> attributes) {
- nodeIds.forEach((k) -> {
- SchedulerNode node = nodeTracker.getNode(k);
- node.updateNodeAttributes(attributes);
- });
- }
-
- /**
- * Process node labels update.
- */
- private void updateNodeLabelsAndQueueResource(
- NodeLabelsUpdateSchedulerEvent labelUpdateEvent) {
- writeLock.lock();
- try {
- Set<String> updateLabels = new HashSet<String>();
- for (Entry<NodeId, Set<String>> entry : labelUpdateEvent
- .getUpdatedNodeToLabels().entrySet()) {
- NodeId id = entry.getKey();
- Set<String> labels = entry.getValue();
- FiCaSchedulerNode node = nodeTracker.getNode(id);
-
- if (node != null) {
- // Update old partition to list.
- updateLabels.add(node.getPartition());
- }
- updateLabelsOnNode(id, labels);
- updateLabels.addAll(labels);
- }
- refreshLabelToNodeCache(updateLabels);
- Resource clusterResource = getClusterResource();
- getRootQueue().updateClusterResource(clusterResource,
- new ResourceLimits(clusterResource));
- } finally {
- writeLock.unlock();
- }
- }
-
- private void refreshLabelToNodeCache(Set<String> updateLabels) {
- Map<String, Set<NodeId>> labelMapping = labelManager
- .getLabelsToNodes(updateLabels);
- for (String label : updateLabels) {
- Set<NodeId> nodes = labelMapping.get(label);
- if (nodes == null) {
- continue;
- }
- nodeTracker.updateNodesPerPartition(label, nodes);
- }
- }
-
- /**
- * Add nodes to the nodeTracker. Used when validating CS configuration by
- * instantiating a new CS instance.
- * @param nodesToAdd nodes to be added
- */
- public void addNodes(List<FiCaSchedulerNode> nodesToAdd) {
- writeLock.lock();
- try {
- for (FiCaSchedulerNode node : nodesToAdd) {
- nodeTracker.addNode(node);
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- private void addNode(RMNode nodeManager) {
- writeLock.lock();
- try {
- FiCaSchedulerNode schedulerNode = new FiCaSchedulerNode(nodeManager,
- usePortForNodeName, nodeManager.getNodeLabels());
- nodeTracker.addNode(schedulerNode);
-
- // update this node to node label manager
- if (labelManager != null) {
- labelManager.activateNode(nodeManager.getNodeID(),
- schedulerNode.getTotalResource());
- }
-
- // recover attributes from store if any.
- if (rmContext.getNodeAttributesManager() != null) {
- rmContext.getNodeAttributesManager()
- .refreshNodeAttributesToScheduler(schedulerNode.getNodeID());
- }
-
- Resource clusterResource = getClusterResource();
- getRootQueue().updateClusterResource(clusterResource,
- new ResourceLimits(clusterResource));
-
- LOG.info(
- "Added node " + nodeManager.getNodeAddress() + " clusterResource: "
- + clusterResource);
-
- if (scheduleAsynchronously && getNumClusterNodes() == 1) {
- for (AsyncScheduleThread t : asyncSchedulerThreads) {
- t.beginSchedule();
- }
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- private void removeNode(RMNode nodeInfo) {
- writeLock.lock();
- try {
- // update this node to node label manager
- if (labelManager != null) {
- labelManager.deactivateNode(nodeInfo.getNodeID());
- }
-
- NodeId nodeId = nodeInfo.getNodeID();
- FiCaSchedulerNode node = nodeTracker.getNode(nodeId);
- if (node == null) {
- LOG.error("Attempting to remove non-existent node " + nodeId);
- return;
- }
-
- // Remove running containers
- List<RMContainer> runningContainers =
- node.getCopiedListOfRunningContainers();
- for (RMContainer container : runningContainers) {
- super.completedContainer(container, SchedulerUtils
- .createAbnormalContainerStatus(container.getContainerId(),
- SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL);
- node.releaseContainer(container.getContainerId(), true);
- }
-
- // Remove reservations, if any
- RMContainer reservedContainer = node.getReservedContainer();
- if (reservedContainer != null) {
- super.completedContainer(reservedContainer, SchedulerUtils
- .createAbnormalContainerStatus(reservedContainer.getContainerId(),
- SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL);
- }
-
- nodeTracker.removeNode(nodeId);
- Resource clusterResource = getClusterResource();
- getRootQueue().updateClusterResource(clusterResource,
- new ResourceLimits(clusterResource));
- int numNodes = nodeTracker.nodeCount();
-
- if (scheduleAsynchronously && numNodes == 0) {
- for (AsyncScheduleThread t : asyncSchedulerThreads) {
- t.suspendSchedule();
- }
- }
-
- LOG.info(
- "Removed node " + nodeInfo.getNodeAddress() + " clusterResource: "
- + getClusterResource());
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- protected void completedContainerInternal(
- RMContainer rmContainer, ContainerStatus containerStatus,
- RMContainerEventType event) {
- Container container = rmContainer.getContainer();
- ContainerId containerId = container.getId();
-
- // Get the application for the finished container
- FiCaSchedulerApp application = getCurrentAttemptForContainer(
- container.getId());
- ApplicationId appId =
- containerId.getApplicationAttemptId().getApplicationId();
- if (application == null) {
- LOG.info(
- "Container " + container + " of" + " finished application " + appId
- + " completed with event " + event);
- return;
- }
-
- // Get the node on which the container was allocated
- FiCaSchedulerNode node = getNode(container.getNodeId());
- if (null == node) {
- LOG.info("Container " + container + " of" + " removed node " + container
- .getNodeId() + " completed with event " + event);
- return;
- }
-
- // Inform the queue
- LeafQueue queue = (LeafQueue) application.getQueue();
- queue.completedContainer(getClusterResource(), application, node,
- rmContainer, containerStatus, event, null, true);
- }
-
- @Lock(Lock.NoLock.class)
- @VisibleForTesting
- @Override
- public FiCaSchedulerApp getApplicationAttempt(
- ApplicationAttemptId applicationAttemptId) {
- return super.getApplicationAttempt(applicationAttemptId);
- }
-
- @Lock(Lock.NoLock.class)
- public FiCaSchedulerNode getNode(NodeId nodeId) {
- return nodeTracker.getNode(nodeId);
- }
-
- @Lock(Lock.NoLock.class)
- public List<FiCaSchedulerNode> getAllNodes() {
- return nodeTracker.getAllNodes();
- }
-
- @Override
- @Lock(Lock.NoLock.class)
- public void recover(RMState state) throws Exception {
- // NOT IMPLEMENTED
- }
-
- @Override
- public void killReservedContainer(RMContainer container) {
- LOG.debug("{}:{}", SchedulerEventType.KILL_RESERVED_CONTAINER, container);
-
- // To think: What happens if this is no longer a reserved container, for
- // e.g if the reservation became an allocation.
- super.completedContainer(container,
- SchedulerUtils.createAbnormalContainerStatus(
- container.getContainerId(),
- SchedulerUtils.UNRESERVED_CONTAINER),
- RMContainerEventType.KILL);
- }
-
- @Override
- public void markContainerForPreemption(ApplicationAttemptId aid,
- RMContainer cont) {
- LOG.debug("{}: appAttempt:{} container:{}",
- SchedulerEventType.MARK_CONTAINER_FOR_PREEMPTION, aid, cont);
- FiCaSchedulerApp app = getApplicationAttempt(aid);
- if (app != null) {
- app.markContainerForPreemption(cont.getContainerId());
- }
- }
-
- @VisibleForTesting
- @Override
- public void killContainer(RMContainer container) {
- markContainerForKillable(container);
- }
-
- public void markContainerForKillable(
- RMContainer killableContainer) {
- writeLock.lock();
- try {
- LOG.debug("{}: container {}",
- SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE, killableContainer);
-
- if (!isLazyPreemptionEnabled) {
- super.completedContainer(killableContainer, SchedulerUtils
- .createPreemptedContainerStatus(killableContainer.getContainerId(),
- SchedulerUtils.PREEMPTED_CONTAINER), RMContainerEventType.KILL);
- } else {
- FiCaSchedulerNode node = getSchedulerNode(
- killableContainer.getAllocatedNode());
+ printedVerboseLoggingForAsyncScheduling = false;
+ }
- FiCaSchedulerApp application = getCurrentAttemptForContainer(
- killableContainer.getContainerId());
+ // Allocate containers of node [start, end)
+ for (FiCaSchedulerNode node : nodes) {
+ if (current++ >= start) {
+ if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) {
+ continue;
+ }
+ cs.allocateContainersToNode(node.getNodeID(), false);
+ }
+ }
- node.markContainerToKillable(killableContainer.getContainerId());
+ current = 0;
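+ // Reset the cursor and wrap around so the nodes before the start
+ // offset also get a scheduling pass in this round.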
- // notify PreemptionManager
- // Get the application for the finished container
- if (null != application) {
- String leafQueuePath = application.getCSLeafQueue().getQueuePath();
- getPreemptionManager().addKillableContainer(
- new KillableContainer(killableContainer, node.getPartition(),
- leafQueuePath));
- }
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- private void markContainerForNonKillable(
- RMContainer nonKillableContainer) {
- writeLock.lock();
- try {
- LOG.debug("{}: container {}", SchedulerEventType.
- MARK_CONTAINER_FOR_NONKILLABLE, nonKillableContainer);
-
- FiCaSchedulerNode node = getSchedulerNode(
- nonKillableContainer.getAllocatedNode());
-
- FiCaSchedulerApp application = getCurrentAttemptForContainer(
- nonKillableContainer.getContainerId());
-
- node.markContainerToNonKillable(nonKillableContainer.getContainerId());
-
- // notify PreemptionManager
- // Get the application for the finished container
- if (null != application) {
- String leafQueuePath = application.getCSLeafQueue().getQueuePath();
- getPreemptionManager().removeKillableContainer(
- new KillableContainer(nonKillableContainer, node.getPartition(),
- leafQueuePath));
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- public boolean checkAccess(UserGroupInformation callerUGI,
- QueueACL acl, String queueName) {
- CSQueue queue = getQueue(queueName);
-
- if (queueName.startsWith("root.")) {
- // can only check proper ACLs if the path is fully qualified
- while (queue == null) {
- int sepIndex = queueName.lastIndexOf(".");
- String parentName = queueName.substring(0, sepIndex);
- if (LOG.isDebugEnabled()) {
- LOG.debug("Queue {} does not exist, checking parent {}",
- queueName, parentName);
- }
- queueName = parentName;
- queue = queueManager.getQueue(queueName);
- }
- }
-
- if (queue == null) {
- LOG.debug("ACL not found for queue access-type {} for queue {}",
- acl, queueName);
- return false;
- }
- return queue.hasAccess(acl, callerUGI);
- }
-
- @Override
- public List<ApplicationAttemptId> getAppsInQueue(String queueName) {
- CSQueue queue = getQueue(queueName);
- if (queue == null) {
- return null;
- }
- List<ApplicationAttemptId> apps = new ArrayList<ApplicationAttemptId>();
- queue.collectSchedulerApplications(apps);
- return apps;
- }
-
- public boolean isSystemAppsLimitReached() {
- if (getRootQueue().getNumApplications() < conf
- .getMaximumSystemApplications()) {
- return false;
- }
- return true;
- }
-
- private String getDefaultReservationQueueName(String planQueueName) {
- return planQueueName + ReservationConstants.DEFAULT_QUEUE_SUFFIX;
- }
-
- private String resolveReservationQueueName(String queueName,
- ApplicationId applicationId, ReservationId reservationID,
- boolean isRecovering) {
- readLock.lock();
- try {
- CSQueue queue = getQueue(queueName);
- // Check if the queue is a plan queue
- if ((queue == null) || !(queue instanceof PlanQueue)) {
- return queueName;
- }
- if (reservationID != null) {
- String resQName = reservationID.toString();
- queue = getQueue(resQName);
- if (queue == null) {
- // reservation has terminated during failover
- if (isRecovering && conf.getMoveOnExpiry(
- getQueue(queueName).getQueuePath())) {
- // move to the default child queue of the plan
- return getDefaultReservationQueueName(queueName);
- }
- String message = "Application " + applicationId
- + " submitted to a reservation which is not currently active: "
- + resQName;
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
- message));
- return null;
- }
- if (!queue.getParent().getQueuePath().equals(queueName)) {
- String message =
- "Application: " + applicationId + " submitted to a reservation "
- + resQName + " which does not belong to the specified queue: "
- + queueName;
- this.rmContext.getDispatcher().getEventHandler().handle(
- new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
- message));
- return null;
- }
- // use the reservation queue to run the app
- queueName = resQName;
- } else{
- // use the default child queue of the plan for unreserved apps
- queueName = getDefaultReservationQueueName(queueName);
- }
- return queueName;
- } finally {
- readLock.unlock();
- }
-
- }
-
- @Override
- public void removeQueue(String queueName)
- throws SchedulerDynamicEditException {
- writeLock.lock();
- try {
- LOG.info("Removing queue: " + queueName);
- CSQueue q = this.getQueue(queueName);
- if (!(AbstractAutoCreatedLeafQueue.class.isAssignableFrom(
- q.getClass()))) {
- throw new SchedulerDynamicEditException(
- "The queue that we are asked " + "to remove (" + queueName
- + ") is not a AutoCreatedLeafQueue or ReservationQueue");
- }
- AbstractAutoCreatedLeafQueue disposableLeafQueue =
- (AbstractAutoCreatedLeafQueue) q;
- // at this point we should have no more apps
- if (disposableLeafQueue.getNumApplications() > 0) {
- throw new SchedulerDynamicEditException(
- "The queue " + queueName + " is not empty " + disposableLeafQueue
- .getApplications().size() + " active apps "
- + disposableLeafQueue.getPendingApplications().size()
- + " pending apps");
- }
-
- ((AbstractManagedParentQueue) disposableLeafQueue.getParent())
- .removeChildQueue(q);
- this.queueManager.removeQueue(queueName);
- LOG.info(
- "Removal of AutoCreatedLeafQueue " + queueName + " has succeeded");
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- public void addQueue(Queue queue)
- throws SchedulerDynamicEditException, IOException {
- writeLock.lock();
- try {
- if (queue == null) {
- throw new SchedulerDynamicEditException(
- "Queue specified is null. Should be an implementation of "
- + "AbstractAutoCreatedLeafQueue");
- } else if (!(AbstractAutoCreatedLeafQueue.class
- .isAssignableFrom(queue.getClass()))) {
- throw new SchedulerDynamicEditException(
- "Queue is not an implementation of "
- + "AbstractAutoCreatedLeafQueue : " + queue.getClass());
- }
-
- AbstractAutoCreatedLeafQueue newQueue =
- (AbstractAutoCreatedLeafQueue) queue;
-
- if (newQueue.getParent() == null || !(AbstractManagedParentQueue.class.
- isAssignableFrom(newQueue.getParent().getClass()))) {
- throw new SchedulerDynamicEditException(
- "ParentQueue for " + newQueue + " is not properly set"
- + " (should be set and be a PlanQueue or ManagedParentQueue)");
- }
-
- AbstractManagedParentQueue parent =
- (AbstractManagedParentQueue) newQueue.getParent();
- String queuePath = newQueue.getQueuePath();
- parent.addChildQueue(newQueue);
- this.queueManager.addQueue(queuePath, newQueue);
-
- LOG.info("Creation of AutoCreatedLeafQueue " + newQueue + " succeeded");
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- public void setEntitlement(String inQueue, QueueEntitlement entitlement)
- throws YarnException {
- writeLock.lock();
- try {
- LeafQueue queue = this.queueManager.getAndCheckLeafQueue(inQueue);
- AbstractManagedParentQueue parent =
- (AbstractManagedParentQueue) queue.getParent();
-
- if (!(AbstractAutoCreatedLeafQueue.class.isAssignableFrom(
- queue.getClass()))) {
- throw new SchedulerDynamicEditException(
- "Entitlement can not be" + " modified dynamically since queue "
- + inQueue + " is not a AutoCreatedLeafQueue");
- }
-
- if (parent == null || !(AbstractManagedParentQueue.class.isAssignableFrom(
- parent.getClass()))) {
- throw new SchedulerDynamicEditException(
- "The parent of AutoCreatedLeafQueue " + inQueue
- + " must be a PlanQueue/ManagedParentQueue");
- }
-
- AbstractAutoCreatedLeafQueue newQueue =
- (AbstractAutoCreatedLeafQueue) queue;
- parent.validateQueueEntitlementChange(newQueue, entitlement);
-
- newQueue.setEntitlement(entitlement);
-
- LOG.info("Set entitlement for AutoCreatedLeafQueue " + inQueue + " to "
- + queue.getCapacity() + " request was (" + entitlement.getCapacity()
- + ")");
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- public String moveApplication(ApplicationId appId,
- String targetQueueName) throws YarnException {
- writeLock.lock();
- try {
- SchedulerApplication<FiCaSchedulerApp> application =
- applications.get(appId);
- if (application == null) {
- throw new YarnException("App to be moved " + appId + " not found.");
- }
- String sourceQueueName = application.getQueue().getQueueName();
- LeafQueue source =
- this.queueManager.getAndCheckLeafQueue(sourceQueueName);
- String destQueueName = handleMoveToPlanQueue(targetQueueName);
- LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName);
-
- String user = application.getUser();
- try {
- dest.submitApplication(appId, user, destQueueName);
- } catch (AccessControlException e) {
- throw new YarnException(e);
- }
-
- FiCaSchedulerApp app = application.getCurrentAppAttempt();
- if (app != null) {
- // Move all live containers even when stopped.
- // For transferStateFromPreviousAttempt required
- for (RMContainer rmContainer : app.getLiveContainers()) {
- source.detachContainer(getClusterResource(), app, rmContainer);
- // attach the Container to another queue
- dest.attachContainer(getClusterResource(), app, rmContainer);
- }
- // Move all reserved containers
- for (RMContainer rmContainer : app.getReservedContainers()) {
- source.detachContainer(getClusterResource(), app, rmContainer);
- dest.attachContainer(getClusterResource(), app, rmContainer);
- }
- if (!app.isStopped()) {
- source.finishApplicationAttempt(app, sourceQueueName);
- // Submit to a new queue
- dest.submitApplicationAttempt(app, user, true);
- }
- // Finish app & update metrics
- app.move(dest);
- }
- source.appFinished();
- // Detach the application..
- source.getParent().finishApplication(appId, user);
- application.setQueue(dest);
- LOG.info("App: " + appId + " successfully moved from " + sourceQueueName
- + " to: " + destQueueName);
- return targetQueueName;
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- public void preValidateMoveApplication(ApplicationId appId,
- String newQueue) throws YarnException {
- writeLock.lock();
- try {
- SchedulerApplication<FiCaSchedulerApp> application =
- applications.get(appId);
- if (application == null) {
- throw new YarnException("App to be moved " + appId + " not found.");
- }
- Queue queue = application.getQueue();
- String sourceQueueName = queue instanceof CSQueue ?
- ((CSQueue) queue).getQueuePath() : queue.getQueueName();
- this.queueManager.getAndCheckLeafQueue(sourceQueueName);
- String destQueueName = handleMoveToPlanQueue(newQueue);
- LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName);
- // Validation check - ACLs, submission limits for user & queue
- String user = application.getUser();
- // Check active partition only when attempt is available
- FiCaSchedulerApp appAttempt =
- getApplicationAttempt(ApplicationAttemptId.newInstance(appId, 0));
- if (null != appAttempt) {
- checkQueuePartition(appAttempt, dest);
- }
- try {
- dest.validateSubmitApplication(appId, user, destQueueName);
- } catch (AccessControlException e) {
- throw new YarnException(e);
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- /**
- * Check whether an application can be moved to a queue with labels
- * enabled. All labels used during the application's lifetime are checked.
- *
- * @param app application to be moved
- * @param dest target leaf queue
- * @throws YarnException if the queue cannot satisfy the app's labels
- */
- private void checkQueuePartition(FiCaSchedulerApp app, LeafQueue dest)
- throws YarnException {
- if (!YarnConfiguration.areNodeLabelsEnabled(conf)) {
- return;
- }
- Set<String> targetqueuelabels = dest.getAccessibleNodeLabels();
- AppSchedulingInfo schedulingInfo = app.getAppSchedulingInfo();
- Set<String> appLabelexpressions = schedulingInfo.getRequestedPartitions();
- // default partition access is always available, so remove the empty label
- appLabelexpressions.remove(RMNodeLabelsManager.NO_LABEL);
- Set<String> nonAccessiblelabels = new HashSet<String>();
- for (String label : appLabelexpressions) {
- if (!SchedulerUtils.checkQueueLabelExpression(targetqueuelabels, label,
- null)) {
- nonAccessiblelabels.add(label);
- }
- }
- if (nonAccessiblelabels.size() > 0) {
- throw new YarnException(
- "Specified queue=" + dest.getQueuePath() + " can't satisfy following "
- + "apps label expressions =" + nonAccessiblelabels
- + " accessible node labels =" + targetqueuelabels);
- }
- }
-
- /** {@inheritDoc} */
- @Override
- public EnumSet<SchedulerResourceTypes> getSchedulingResourceTypes() {
- if (calculator.getClass().getName()
- .equals(DefaultResourceCalculator.class.getName())) {
- return EnumSet.of(SchedulerResourceTypes.MEMORY);
- }
- return EnumSet.of(SchedulerResourceTypes.MEMORY, SchedulerResourceTypes.CPU);
- }
-
- @Override
- public Resource getMaximumResourceCapability(String queueName) {
- if (queueName == null || queueName.isEmpty()) {
- return getMaximumResourceCapability();
- }
- CSQueue queue = getQueue(queueName);
- if (queue == null) {
- if (isAmbiguous(queueName)) {
- LOG.error("Ambiguous queue reference: " + queueName
- + " please use full queue path instead.");
- } else {
- LOG.error("Unknown queue: " + queueName);
- }
- return getMaximumResourceCapability();
- }
- if (!(queue instanceof LeafQueue)) {
- LOG.error("queue " + queueName + " is not an leaf queue");
- return getMaximumResourceCapability();
- }
-
- // queue.getMaxAllocation returns *configured* maximum allocation.
- // getMaximumResourceCapability() returns the maximum allocation, taking
- // per-node maximum resources into account. So return the component-wise
- // min of the two.
-
- Resource queueMaxAllocation = ((LeafQueue)queue).getMaximumAllocation();
- Resource clusterMaxAllocationConsiderNodeMax =
- getMaximumResourceCapability();
-
- return Resources.componentwiseMin(queueMaxAllocation,
- clusterMaxAllocationConsiderNodeMax);
- }
-
- private String handleMoveToPlanQueue(String targetQueueName) {
- CSQueue dest = getQueue(targetQueueName);
- if (dest != null && dest instanceof PlanQueue) {
- // use the default child reservation queue of the plan
- targetQueueName = targetQueueName + ReservationConstants.DEFAULT_QUEUE_SUFFIX;
- }
- return targetQueueName;
- }
-
- @Override
- public Set<String> getPlanQueues() {
- Set<String> ret = new HashSet<String>();
- for (Entry<String, CSQueue> l : queueManager.getQueues().entrySet()) {
- if (l.getValue() instanceof PlanQueue) {
- ret.add(l.getKey());
- }
- }
- return ret;
- }
-
- @Override
- public Priority checkAndGetApplicationPriority(
- Priority priorityRequestedByApp, UserGroupInformation user,
- String queuePath, ApplicationId applicationId) throws YarnException {
- readLock.lock();
- try {
- Priority appPriority = priorityRequestedByApp;
-
- // Verify the scenario where priority is null from submissionContext.
- if (null == appPriority) {
- // Verify whether submitted user has any default priority set. If so,
- // user's default priority will get precedence over queue default.
- // for updateApplicationPriority call flow, this check is done in
- // ClientRMService itself.
- appPriority = this.appPriorityACLManager.getDefaultPriority(
- normalizeQueueName(queuePath),
- user);
-
- // Get the default priority for the Queue. If Queue is non-existent,
- // then
- // use default priority. Do it only if user doesn't have any default.
- if (null == appPriority) {
- appPriority = this.queueManager.getDefaultPriorityForQueue(
- normalizeQueueName(queuePath));
- }
-
- LOG.info(
- "Application '" + applicationId + "' is submitted without priority "
- + "hence considering default queue/cluster priority: "
- + appPriority.getPriority());
- }
-
- // Verify whether the submitted priority is less than the max priority
- // in the cluster. If it is out of bounds, cap it at the maximum.
- if (appPriority.getPriority() > getMaxClusterLevelAppPriority()
- .getPriority()) {
- appPriority = Priority
- .newInstance(getMaxClusterLevelAppPriority().getPriority());
- }
-
- // Lets check for ACLs here.
- if (!appPriorityACLManager.checkAccess(user, normalizeQueueName(queuePath), appPriority)) {
- throw new YarnException(new AccessControlException(
- "User " + user + " does not have permission to submit/update "
- + applicationId + " for " + appPriority));
- }
-
- LOG.info("Priority '" + appPriority.getPriority()
- + "' is acceptable in queue : " + queuePath + " for application: "
- + applicationId);
-
- return appPriority;
- } finally {
- readLock.unlock();
- }
- }
-
- @Override
- public Priority updateApplicationPriority(Priority newPriority,
- ApplicationId applicationId, SettableFuture<Object> future,
- UserGroupInformation user)
- throws YarnException {
- writeLock.lock();
- try {
- Priority appPriority = null;
- SchedulerApplication<FiCaSchedulerApp> application = applications
- .get(applicationId);
-
- if (application == null) {
- throw new YarnException("Application '" + applicationId
- + "' is not present, hence could not change priority.");
- }
-
- RMApp rmApp = rmContext.getRMApps().get(applicationId);
-
- appPriority = checkAndGetApplicationPriority(newPriority, user,
- rmApp.getQueue(), applicationId);
-
- if (application.getPriority().equals(appPriority)) {
- future.set(null);
- return appPriority;
- }
-
- // Update new priority in Submission Context to update to StateStore.
- rmApp.getApplicationSubmissionContext().setPriority(appPriority);
-
- // Update to state store
- ApplicationStateData appState = ApplicationStateData.newInstance(
- rmApp.getSubmitTime(), rmApp.getStartTime(),
- rmApp.getApplicationSubmissionContext(), rmApp.getUser(),
- rmApp.getRealUser(), rmApp.getCallerContext());
- appState.setApplicationTimeouts(rmApp.getApplicationTimeouts());
- appState.setLaunchTime(rmApp.getLaunchTime());
- rmContext.getStateStore().updateApplicationStateSynchronously(appState,
- false, future);
-
- // As we use iterator over a TreeSet for OrderingPolicy, once we change
- // priority then reinsert back to make order correct.
- LeafQueue queue = (LeafQueue) getQueue(rmApp.getQueue());
- queue.updateApplicationPriority(application, appPriority);
-
- LOG.info("Priority '" + appPriority + "' is updated in queue :"
- + rmApp.getQueue() + " for application: " + applicationId
- + " for the user: " + rmApp.getUser());
- return appPriority;
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- public PreemptionManager getPreemptionManager() {
- return preemptionManager;
- }
-
- @Override
- public ResourceUsage getClusterResourceUsage() {
- return getRootQueue().getQueueResourceUsage();
- }
-
- private SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode> getSchedulerContainer(
- RMContainer rmContainer, boolean allocated) {
- if (null == rmContainer) {
- return null;
- }
-
- FiCaSchedulerApp app = getApplicationAttempt(
- rmContainer.getApplicationAttemptId());
- if (null == app) { return null; }
-
- NodeId nodeId;
- // Get nodeId
- if (rmContainer.getState() == RMContainerState.RESERVED) {
- nodeId = rmContainer.getReservedNode();
- } else {
- nodeId = rmContainer.getNodeId();
- }
-
- FiCaSchedulerNode node = getNode(nodeId);
- if (null == node) {
- return null;
- }
- return new SchedulerContainer<>(app, node, rmContainer,
- // TODO, node partition should come from CSAssignment to avoid partition
- // get updated before submitting the commit
- node.getPartition(), allocated);
- }
-
- private List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>>
- getSchedulerContainersToRelease(
- CSAssignment csAssignment) {
- List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>> list = null;
-
- if (csAssignment.getContainersToKill() != null && !csAssignment
- .getContainersToKill().isEmpty()) {
- list = new ArrayList<>();
- for (RMContainer rmContainer : csAssignment.getContainersToKill()) {
- SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode> schedulerContainer =
- getSchedulerContainer(rmContainer, false);
- if (schedulerContainer != null) {
- list.add(schedulerContainer);
- }
- }
- }
-
- if (csAssignment.getExcessReservation() != null) {
- if (null == list) {
- list = new ArrayList<>();
- }
- SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode> schedulerContainer =
- getSchedulerContainer(csAssignment.getExcessReservation(), false);
- if (schedulerContainer != null) {
- list.add(schedulerContainer);
- }
- }
-
- if (list != null && list.isEmpty()) {
- list = null;
- }
- return list;
- }
-
- @VisibleForTesting
- public void submitResourceCommitRequest(Resource cluster,
- CSAssignment csAssignment) {
- ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request =
- createResourceCommitRequest(csAssignment);
-
- if (null == request) {
- return;
- }
-
- if (scheduleAsynchronously) {
- // Submit to a commit thread and commit it async-ly
- resourceCommitterService.addNewCommitRequest(request);
- } else{
- // Otherwise do it sync-ly.
- tryCommit(cluster, request, true);
- }
- }
-
- @Override
- public boolean attemptAllocationOnNode(SchedulerApplicationAttempt appAttempt,
- SchedulingRequest schedulingRequest, SchedulerNode schedulerNode) {
- if (schedulingRequest.getResourceSizing() != null) {
- if (schedulingRequest.getResourceSizing().getNumAllocations() > 1) {
- LOG.warn("The SchedulingRequest has requested more than 1 allocation," +
- " but only 1 will be attempted !!");
- }
- if (!appAttempt.isStopped()) {
- ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>
- resourceCommitRequest = createResourceCommitRequest(
- appAttempt, schedulingRequest, schedulerNode);
-
- // Validate placement constraint is satisfied before
- // committing the request.
- try {
- if (!PlacementConstraintsUtil.canSatisfyConstraints(
- appAttempt.getApplicationId(),
- schedulingRequest, schedulerNode,
- rmContext.getPlacementConstraintManager(),
- rmContext.getAllocationTagsManager())) {
- LOG.info("Failed to allocate container for application "
- + appAttempt.getApplicationId() + " on node "
- + schedulerNode.getNodeName()
- + " because this allocation violates the"
- + " placement constraint.");
+ // Allocate containers of node [0, start)
+ for (FiCaSchedulerNode node : nodes) {
+ if (current++ >= start) {
+ break;
+ }
+ if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) {
+ continue;
+ }
+ cs.allocateContainersToNode(node.getNodeID(), false);
+ }
+
+ if (printSkipedNodeLogging) {
+ printedVerboseLoggingForAsyncScheduling = true;
+ }
+
+ Thread.sleep(cs.getAsyncScheduleInterval());
+ }
+
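+ /**
+ * Daemon thread that drives asynchronous scheduling: each iteration
+ * runs one allocation pass over the cluster nodes, preferring the
+ * load-sorted node list from the loads metric server when available.
+ */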
+ static class AsyncScheduleThread extends Thread {
+
+ private final CapacityScheduler cs;
+ private AtomicBoolean runSchedules = new AtomicBoolean(false);
+
+ public AsyncScheduleThread(CapacityScheduler cs) {
+ this.cs = cs;
+ setDaemon(true);
+ }
+
+ @Override
+ public void run() {
+ int debuggingLogCounter = 0;
+ while (!Thread.currentThread().isInterrupted()) {
+ try {
+ if (!runSchedules.get()) {
+ Thread.sleep(100);
+ } else {
+ // Don't run schedule if we have some pending backlogs already
+ if (cs.getAsyncSchedulingPendingBacklogs()
+ > cs.asyncMaxPendingBacklogs) {
+ Thread.sleep(1);
+ } else {
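+ // Prefer load-aware scheduling when the loads metric server has
+ // supplied a fresh sorted node list; otherwise fall back to the
+ // default scheduling order.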
+ List<String> sortedNodesHost = cs.loadsMetricServerRequestThread.getSortedNodeHosts();
+ if (!sortedNodesHost.isEmpty()) {
+ scheduleLoadBased(cs, sortedNodesHost);
+ } else {
+ schedule(cs);
+ }
+ if (LOG.isDebugEnabled()) {
+ // Adding a debug log here to ensure that the thread is alive
+ // and running fine.
+ if (debuggingLogCounter++ > 10000) {
+ debuggingLogCounter = 0;
+ LOG.debug("AsyncScheduleThread[" + getName() + "] is running!");
+ }
+ }
+ }
+ }
+ } catch (InterruptedException ie) {
+ // keep interrupt signal
+ Thread.currentThread().interrupt();
+ }
+ }
+ LOG.info("AsyncScheduleThread[" + getName() + "] exited!");
+ }
+
+ public void beginSchedule() {
+ runSchedules.set(true);
+ }
+
+ public void suspendSchedule() {
+ runSchedules.set(false);
+ }
+
+ }
+
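+ /**
+ * Daemon thread that takes allocation/reservation proposals off a
+ * blocking queue and commits them one at a time under the scheduler
+ * write lock, so commits never race with each other.
+ */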
+ static class ResourceCommitterService extends Thread {
+ private final CapacityScheduler cs;
+ private BlockingQueue<ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>>
+ backlogs = new LinkedBlockingQueue<>();
+
+ public ResourceCommitterService(CapacityScheduler cs) {
+ this.cs = cs;
+ setDaemon(true);
+ }
+
+ @Override
+ public void run() {
+ while (!Thread.currentThread().isInterrupted()) {
+ try {
+ ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request =
+ backlogs.take();
+ cs.writeLock.lock();
+ try {
+ cs.tryCommit(cs.getClusterResource(), request, true);
+ } finally {
+ cs.writeLock.unlock();
+ }
+
+ } catch (InterruptedException e) {
+ LOG.error(e.toString());
+ Thread.currentThread().interrupt();
+ }
+ }
+ LOG.info("ResourceCommitterService exited!");
+ }
+
+ public void addNewCommitRequest(
+ ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> proposal) {
+ backlogs.add(proposal);
+ }
+
+ public int getPendingBacklogs() {
+ return backlogs.size();
+ }
+ }
+
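+ /**
+ * Daemon thread that talks to the external loads metric server: it
+ * uploads per-node resource usage and periodically fetches the sorted
+ * and overloaded node host lists, caching them locally together with a
+ * server-supplied expiration time.
+ */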
+ public static class LoadsMetricServerRequestThread extends Thread {
+ private final CapacityScheduler cs;
+ private List<String> sortedNodeHosts = new ArrayList<>();
+ private Set<String> overLoadNodeHosts = new HashSet<>();
+ private long expirationTime;
+ private static final long DEFAULT_EXPIRED_TIME = 0L;
+
+ private final ReentrantReadWriteLock.ReadLock readLock;
+
+ private final ReentrantReadWriteLock.WriteLock writeLock;
+
+ private static final int GET_STATUS_CODE_OK = 200;
+ private static final String SORTED_NODE_HOSTS = "sorted_node_hosts";
+ private static final String OVERLOAD_NODE_HOSTS = "overload_node_hosts";
+ private static final String EXPIRATION_TIME = "expiration_time";
+ private static final String COMMA = ",";
+
+ public LoadsMetricServerRequestThread(CapacityScheduler cs) {
+ this.cs = cs;
+ setDaemon(true);
+ ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
+ this.readLock = lock.readLock();
+ this.writeLock = lock.writeLock();
+ }
+
+ public void uploadNodeResource(String hostName, Resource allocatedResource, Resource totalResource) {
+ ObjectMapper objectMapper = new ObjectMapper();
+ Map<String, Object> nodeResource = new HashMap<>();
+ nodeResource.put("host_name", hostName);
+ nodeResource.put("allocated_mem", allocatedResource.getResourceInformation(0).getValue());
+ nodeResource.put("total_mem", totalResource.getResourceInformation(0).getValue());
+ nodeResource.put("allocated_vcores", allocatedResource.getResourceInformation(1).getValue());
+ nodeResource.put("total_vcores", totalResource.getResourceInformation(1).getValue());
+
+ try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+ // node loads request
+ HttpPost httpPost = new HttpPost(cs.loadsMetricServerUploadNodeResourceUrl);
+ httpPost.setHeader("Content-Type", "application/json");
+ StringEntity requestEntity = new StringEntity(
+ objectMapper.writeValueAsString(nodeResource), StandardCharsets.UTF_8);
+ httpPost.setEntity(requestEntity);
+
+ try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
+ int statusCode = response.getStatusLine().getStatusCode();
+ if (statusCode != GET_STATUS_CODE_OK) {
+ LOG.error("request {} failed with status code:{}", cs.loadsMetricServerUploadNodeResourceUrl, statusCode);
+ }
+ }
+ } catch (IOException e) {
+ LOG.error("request {} failed for {}", cs.loadsMetricServerUploadNodeResourceUrl, e.getMessage());
+ }
+ }
+
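+ /**
+ * Poll loop: periodically GET the load report from the loads metric
+ * server and refresh the cached node lists.
+ */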
+ @Override
+ public void run() {
+ while (!Thread.currentThread().isInterrupted()) {
+ try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+ // node loads request
+ HttpGet httpGet = new HttpGet(cs.loadsMetricServerSortUrl);
+ try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
+ int statusCode = response.getStatusLine().getStatusCode();
+ if (statusCode == GET_STATUS_CODE_OK) {
+ HttpEntity entity = response.getEntity();
+ if (entity != null) {
+ JSONObject jsonObject = new JSONObject(EntityUtils.toString(entity));
+ // update the cached load state
+ update(jsonObject);
+ } else {
+ LOG.warn("request {} with empty response", cs.loadsMetricServerSortUrl);
+ }
+ } else {
+ LOG.error("request {} failed with status code:{}", cs.loadsMetricServerSortUrl, statusCode);
+ }
+ }
+ } catch (IOException | JSONException e) {
+ LOG.error("request {} failed for {}", cs.loadsMetricServerSortUrl, e.getMessage());
+ } finally {
+ try {
+ Thread.sleep(cs.loadsMetricServerRequestInterval);
+ } catch (InterruptedException e) {
+ // restore the interrupt status so the poll loop can terminate
+ Thread.currentThread().interrupt();
+ LOG.warn("LoadsMetricServerRequestThread interrupted while sleeping", e);
+ }
+ }
+ }
+ }
+
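+ /**
+ * Parse the loads metric server response and atomically replace the
+ * cached overloaded-host set, sorted host list and expiration time.
+ */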
+ private void update(JSONObject jsonObject) throws JSONException {
+ // take the write lock before mutating the cached state
+ writeLock.lock();
+ try {
+ expirationTime = DEFAULT_EXPIRED_TIME;
+ // update overLoadNodesHost
+ overLoadNodeHosts = new HashSet<>();
+ if (!StringUtils.isBlank(jsonObject.getString(OVERLOAD_NODE_HOSTS))) {
+ overLoadNodeHosts.addAll(Arrays.asList(jsonObject.getString(OVERLOAD_NODE_HOSTS).split(COMMA)));
+ }
+ if (!overLoadNodeHosts.isEmpty()) {
+ LOG.warn("overLoadNodeHosts:{}", overLoadNodeHosts);
+ }
+ // update sortedNodesHost
+ sortedNodeHosts = Arrays.stream(jsonObject.getString(SORTED_NODE_HOSTS).split(COMMA))
+ .collect(Collectors.toList());
+ if (LOG.isDebugEnabled() && !sortedNodeHosts.isEmpty()) {
+ LOG.debug("sortedNodeHosts:{}", sortedNodeHosts);
+ }
+ // update expirationTime
+ expirationTime = jsonObject.getLong(EXPIRATION_TIME);
+ } finally {
+ // unlock
+ writeLock.unlock();
+ }
+ }
+
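+ /**
+ * Whether the cached load data has passed the server-supplied
+ * expiration time. Expiry is only reported once data has actually
+ * been received (i.e. the sorted host list is non-empty).
+ */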
+ private boolean isExpired() {
+ if (System.currentTimeMillis() > expirationTime && !sortedNodeHosts.isEmpty()) {
+ LOG.warn("loads metric request is expired at {}", new Date(expirationTime));
+ return true;
+ }
return false;
- }
- } catch (InvalidAllocationTagsQueryException e) {
- LOG.warn("Unable to allocate container", e);
- return false;
- }
- return tryCommit(getClusterResource(), resourceCommitRequest, false);
- }
- }
- return false;
- }
-
- // This assumes numContainers = 1 for the request.
- private ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>
- createResourceCommitRequest(SchedulerApplicationAttempt appAttempt,
- SchedulingRequest schedulingRequest, SchedulerNode schedulerNode) {
- ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> allocated =
- null;
- Resource resource = schedulingRequest.getResourceSizing().getResources();
- if (Resources.greaterThan(calculator, getClusterResource(),
- resource, Resources.none())) {
- ContainerId cId =
- ContainerId.newContainerId(appAttempt.getApplicationAttemptId(),
- appAttempt.getAppSchedulingInfo().getNewContainerId());
- Container container = BuilderUtils.newContainer(
- cId, schedulerNode.getNodeID(), schedulerNode.getHttpAddress(),
- resource, schedulingRequest.getPriority(), null,
- ExecutionType.GUARANTEED,
- schedulingRequest.getAllocationRequestId());
- RMContainer rmContainer = new RMContainerImpl(container,
- SchedulerRequestKey.extractFrom(container),
- appAttempt.getApplicationAttemptId(), container.getNodeId(),
- appAttempt.getUser(), rmContext, false);
- ((RMContainerImpl)rmContainer).setAllocationTags(
- new HashSet<>(schedulingRequest.getAllocationTags()));
-
- SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
- schedulerContainer = getSchedulerContainer(rmContainer, true);
- if (schedulerContainer == null) {
- allocated = null;
- } else {
- allocated = new ContainerAllocationProposal<>(schedulerContainer,
- null, null, NodeType.NODE_LOCAL, NodeType.NODE_LOCAL,
- SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, resource);
- }
- }
-
- if (null != allocated) {
- List<ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode>>
- allocationsList = new ArrayList<>();
- allocationsList.add(allocated);
-
- return new ResourceCommitRequest<>(allocationsList, null, null);
- }
- return null;
- }
-
- @VisibleForTesting
- public ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>
- createResourceCommitRequest(CSAssignment csAssignment) {
- ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> allocated =
- null;
- ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> reserved =
- null;
- List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>> released =
- null;
-
- if (Resources.greaterThan(calculator, getClusterResource(),
- csAssignment.getResource(), Resources.none())) {
- // Allocated something
- List<AssignmentInformation.AssignmentDetails> allocations =
- csAssignment.getAssignmentInformation().getAllocationDetails();
- if (!allocations.isEmpty()) {
- RMContainer rmContainer = allocations.get(0).rmContainer;
- SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
- schedulerContainer = getSchedulerContainer(rmContainer, true);
- if (schedulerContainer == null) {
- allocated = null;
- // Decrease unconfirmed resource if app is alive
- FiCaSchedulerApp app = getApplicationAttempt(
- rmContainer.getApplicationAttemptId());
- if (app != null) {
- app.decUnconfirmedRes(rmContainer.getAllocatedResource());
- }
- } else {
- allocated = new ContainerAllocationProposal<>(schedulerContainer,
- getSchedulerContainersToRelease(csAssignment),
- getSchedulerContainer(
- csAssignment.getFulfilledReservedContainer(), false),
- csAssignment.getType(), csAssignment.getRequestLocalityType(),
- csAssignment.getSchedulingMode() != null ?
- csAssignment.getSchedulingMode() :
- SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY,
- csAssignment.getResource());
- }
- }
-
- // Reserved something
- List<AssignmentInformation.AssignmentDetails> reservation =
- csAssignment.getAssignmentInformation().getReservationDetails();
- if (!reservation.isEmpty()) {
- RMContainer rmContainer = reservation.get(0).rmContainer;
- SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
- schedulerContainer = getSchedulerContainer(rmContainer, false);
- if (schedulerContainer == null) {
- reserved = null;
- } else {
- reserved = new ContainerAllocationProposal<>(schedulerContainer,
- getSchedulerContainersToRelease(csAssignment),
- getSchedulerContainer(
- csAssignment.getFulfilledReservedContainer(), false),
- csAssignment.getType(), csAssignment.getRequestLocalityType(),
- csAssignment.getSchedulingMode() != null ?
- csAssignment.getSchedulingMode() :
- SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY,
- csAssignment.getResource());
- }
- }
- }
-
- // When we don't need to allocate/reserve anything, we can feel free to
- // kill all to-release containers in the request.
- if (null == allocated && null == reserved) {
- released = getSchedulerContainersToRelease(csAssignment);
- }
-
- if (null != allocated || null != reserved || (null != released && !released
- .isEmpty())) {
- List<ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode>>
- allocationsList = null;
- if (allocated != null) {
- allocationsList = new ArrayList<>();
- allocationsList.add(allocated);
- }
-
- List<ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode>>
- reservationsList = null;
- if (reserved != null) {
- reservationsList = new ArrayList<>();
- reservationsList.add(reserved);
- }
-
- return new ResourceCommitRequest<>(allocationsList, reservationsList,
- released);
- }
-
- return null;
- }
-
- @Override
- public boolean tryCommit(Resource cluster, ResourceCommitRequest r,
- boolean updatePending) {
- long commitStart = System.nanoTime();
- ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request =
- (ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>) r;
-
- ApplicationAttemptId attemptId = null;
-
- // We need to update unconfirmed allocated resource of application when
- // any container allocated.
- boolean updateUnconfirmedAllocatedResource =
- request.getContainersToAllocate() != null && !request
- .getContainersToAllocate().isEmpty();
-
- // find the application to accept and apply the ResourceCommitRequest
- if (request.anythingAllocatedOrReserved()) {
- ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> c =
- request.getFirstAllocatedOrReservedContainer();
- attemptId =
- c.getAllocatedOrReservedContainer().getSchedulerApplicationAttempt()
- .getApplicationAttemptId();
- } else {
- if (!request.getContainersToRelease().isEmpty()) {
- attemptId = request.getContainersToRelease().get(0)
- .getSchedulerApplicationAttempt().getApplicationAttemptId();
- }
- }
-
- LOG.debug("Try to commit allocation proposal={}", request);
-
- boolean isSuccess = false;
- if (attemptId != null) {
- FiCaSchedulerApp app = getApplicationAttempt(attemptId);
- // Required sanity check for attemptId - when async-scheduling enabled,
- // proposal might be outdated if AM failover just finished
- // and the proposal queue was not consumed in time
- if (app != null && attemptId.equals(app.getApplicationAttemptId())) {
- if (app.accept(cluster, request, updatePending)
- && app.apply(cluster, request, updatePending)) {
- long commitSuccess = System.nanoTime() - commitStart;
- CapacitySchedulerMetrics.getMetrics()
- .addCommitSuccess(commitSuccess);
- LOG.info("Allocation proposal accepted");
- isSuccess = true;
- } else{
- long commitFailed = System.nanoTime() - commitStart;
- CapacitySchedulerMetrics.getMetrics()
- .addCommitFailure(commitFailed);
- LOG.info("Failed to accept allocation proposal");
- }
-
- LOG.debug("Allocation proposal accepted={}, proposal={}", isSuccess,
- request);
-
- // Update unconfirmed allocated resource.
- if (updateUnconfirmedAllocatedResource) {
- app.decUnconfirmedRes(request.getTotalAllocatedResource());
- }
- }
- }
- return isSuccess;
- }
-
- public int getAsyncSchedulingPendingBacklogs() {
- if (scheduleAsynchronously) {
- return resourceCommitterService.getPendingBacklogs();
- }
- return 0;
- }
-
- @Override
- public CapacitySchedulerQueueManager getCapacitySchedulerQueueManager() {
- return this.queueManager;
- }
-
- public WorkflowPriorityMappingsManager getWorkflowPriorityMappingsManager() {
- return this.workflowPriorityMappingsMgr;
- }
-
- /**
- * Try to move a reserved container to a targetNode.
- * If the targetNode is reserved by another application (other than this one).
- * The previous reservation will be cancelled.
- *
- * @param toBeMovedContainer reserved container will be moved
- * @param targetNode targetNode
- * @return true if move succeeded. Return false if the targetNode is reserved by
- * a different container or move failed because of any other reasons.
- */
- public boolean moveReservedContainer(RMContainer toBeMovedContainer,
- FiCaSchedulerNode targetNode) {
- writeLock.lock();
- try {
- LOG.debug("Trying to move container={} to node={}",
- toBeMovedContainer, targetNode.getNodeID());
-
- FiCaSchedulerNode sourceNode = getNode(toBeMovedContainer.getNodeId());
- if (null == sourceNode) {
- LOG.debug("Failed to move reservation, cannot find source node={}",
- toBeMovedContainer.getNodeId());
- return false;
- }
+ }
- // Target node updated?
- if (getNode(targetNode.getNodeID()) != targetNode) {
- LOG.debug("Failed to move reservation, node updated or removed,"
- + " moving cancelled.");
- return false;
- }
+ /**
+ * Check whether a node is currently reported as overloaded by the
+ * loads metric server. Expired data is treated as "not overloaded".
+ *
+ * @param nodeHost host name of the node to check
+ * @return true if the node is overloaded, false otherwise
+ */
+ public boolean isNodeOverLoad(String nodeHost) {
+ readLock.lock();
+ try {
+ if (isExpired()) {
+ return false;
+ }
+ return overLoadNodeHosts.contains(nodeHost);
+ } finally {
+ readLock.unlock();
+ }
+ }
- // Target node's reservation status changed?
- if (targetNode.getReservedContainer() != null) {
- LOG.debug("Target node's reservation status changed,"
- + " moving cancelled.");
- return false;
- }
+ /**
+ * Get the node host names sorted by load, as reported by the loads
+ * metric server.
+ *
+ * @return the sorted host list, or an empty list if the data has expired
+ */
+ public List getSortedNodeHosts() {
+ readLock.lock();
+ try {
+ if (isExpired()) {
+ return Collections.emptyList();
+ }
+ return sortedNodeHosts;
+ } finally {
+ readLock.unlock();
+ }
+ }
+ }
- FiCaSchedulerApp app = getApplicationAttempt(
- toBeMovedContainer.getApplicationAttemptId());
- if (null == app) {
- LOG.debug("Cannot find to-be-moved container's application={}",
- toBeMovedContainer.getApplicationAttemptId());
- return false;
- }
-
- // finally, move the reserved container
- return app.moveReservation(toBeMovedContainer, sourceNode, targetNode);
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- public long checkAndGetApplicationLifetime(String queueName,
- long lifetimeRequestedByApp) {
- readLock.lock();
- try {
- CSQueue queue = getQueue(queueName);
- if (queue == null || !(queue instanceof LeafQueue)) {
- return lifetimeRequestedByApp;
- }
-
- long defaultApplicationLifetime =
- ((LeafQueue) queue).getDefaultApplicationLifetime();
- long maximumApplicationLifetime =
- ((LeafQueue) queue).getMaximumApplicationLifetime();
-
- // check only for maximum, that's enough because default can't
- // exceed maximum
- if (maximumApplicationLifetime <= 0) {
- return (lifetimeRequestedByApp <= 0) ? defaultApplicationLifetime :
- lifetimeRequestedByApp;
- }
-
- if (lifetimeRequestedByApp <= 0) {
- return defaultApplicationLifetime;
- } else if (lifetimeRequestedByApp > maximumApplicationLifetime) {
- return maximumApplicationLifetime;
- }
- return lifetimeRequestedByApp;
- } finally {
- readLock.unlock();
- }
- }
-
- @Override
- public long getMaximumApplicationLifetime(String queueName) {
- CSQueue queue = getQueue(queueName);
- if (queue == null || !(queue instanceof LeafQueue)) {
- if (isAmbiguous(queueName)) {
- LOG.error("Ambiguous queue reference: " + queueName
- + " please use full queue path instead.");
- } else {
- LOG.error("Unknown queue: " + queueName);
- }
- return -1;
- }
- // In seconds
- return ((LeafQueue) queue).getMaximumApplicationLifetime();
- }
-
- @Override
- public boolean isConfigurationMutable() {
- return csConfProvider instanceof MutableConfigurationProvider;
- }
-
- @Override
- public MutableConfigurationProvider getMutableConfProvider() {
- if (isConfigurationMutable()) {
- return (MutableConfigurationProvider) csConfProvider;
- }
- return null;
- }
-
- private LeafQueue autoCreateLeafQueue(
- ApplicationPlacementContext placementContext)
- throws IOException, YarnException {
-
- AutoCreatedLeafQueue autoCreatedLeafQueue = null;
-
- String leafQueueName = placementContext.getQueue();
- String parentQueueName = placementContext.getParentQueue();
-
- if (!StringUtils.isEmpty(parentQueueName)) {
- CSQueue parentQueue = getQueue(parentQueueName);
-
- if (parentQueue != null && conf.isAutoCreateChildQueueEnabled(
- parentQueue.getQueuePath())) {
-
- ManagedParentQueue autoCreateEnabledParentQueue =
- (ManagedParentQueue) parentQueue;
- autoCreatedLeafQueue = new AutoCreatedLeafQueue(this, leafQueueName,
- autoCreateEnabledParentQueue);
-
- addQueue(autoCreatedLeafQueue);
-
- } else{
- throw new SchedulerDynamicEditException(
- "Could not auto-create leaf queue for " + leafQueueName
- + ". Queue mapping specifies an invalid parent queue "
- + "which does not exist "
- + parentQueueName);
- }
- } else{
- throw new SchedulerDynamicEditException(
- "Could not auto-create leaf queue for " + leafQueueName
- + ". Queue mapping does not specify"
- + " which parent queue it needs to be created under.");
- }
- return autoCreatedLeafQueue;
- }
-
- @Override
- public void resetSchedulerMetrics() {
- CapacitySchedulerMetrics.destroy();
- }
-
- public boolean isMultiNodePlacementEnabled() {
- return multiNodePlacementEnabled;
- }
-
- public int getNumAsyncSchedulerThreads() {
- return asyncSchedulerThreads == null ? 0 : asyncSchedulerThreads.size();
- }
-
- @VisibleForTesting
- public void setMaxRunningAppsEnforcer(CSMaxRunningAppsEnforcer enforcer) {
- this.maxRunningEnforcer = enforcer;
- }
-
- /**
- * Returning true as capacity scheduler supports placement constraints.
- */
- @Override
- public boolean placementConstraintEnabled() {
- return true;
- }
-
- @VisibleForTesting
- public void setQueueManager(CapacitySchedulerQueueManager qm) {
- this.queueManager = qm;
- }
+ @VisibleForTesting
+ public PlacementRule getUserGroupMappingPlacementRule() throws IOException {
+ readLock.lock();
+ try {
+ UserGroupMappingPlacementRule ugRule = new UserGroupMappingPlacementRule();
+ ugRule.initialize(this);
+ return ugRule;
+ } finally {
+ readLock.unlock();
+ }
+ }
+
+ public PlacementRule getAppNameMappingPlacementRule() throws IOException {
+ readLock.lock();
+ try {
+ AppNameMappingPlacementRule anRule = new AppNameMappingPlacementRule();
+ anRule.initialize(this);
+ return anRule;
+ } finally {
+ readLock.unlock();
+ }
+ }
+
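+ /**
+ * Rebuild the queue placement rule chain from configuration and push
+ * it to the RM's PlacementManager.
+ */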
+ @VisibleForTesting
+ public void updatePlacementRules() throws IOException {
+ // Initialize placement rules
+ Collection<String> placementRuleStrs = conf.getStringCollection(
+ YarnConfiguration.QUEUE_PLACEMENT_RULES);
+ List<PlacementRule> placementRules = new ArrayList<>();
+ Set<String> distinguishRuleSet = CapacitySchedulerConfigValidator
+ .validatePlacementRules(placementRuleStrs);
+
+ // add UserGroupMappingPlacementRule if empty; the default value of
+ // yarn.scheduler.queue-placement-rules is user-group
+ if (distinguishRuleSet.isEmpty()) {
+ distinguishRuleSet.add(YarnConfiguration.USER_GROUP_PLACEMENT_RULE);
+ }
+
+ placementRuleStrs = new ArrayList<>(distinguishRuleSet);
+
+ for (String placementRuleStr : placementRuleStrs) {
+ switch (placementRuleStr) {
+ case YarnConfiguration.USER_GROUP_PLACEMENT_RULE:
+ PlacementRule ugRule = getUserGroupMappingPlacementRule();
+ if (null != ugRule) {
+ placementRules.add(ugRule);
+ }
+ break;
+ case YarnConfiguration.APP_NAME_PLACEMENT_RULE:
+ PlacementRule anRule = getAppNameMappingPlacementRule();
+ if (null != anRule) {
+ placementRules.add(anRule);
+ }
+ break;
+ default:
+ boolean isMappingNotEmpty;
+ try {
+ PlacementRule rule = PlacementFactory.getPlacementRule(
+ placementRuleStr, conf);
+ if (null != rule) {
+ try {
+ isMappingNotEmpty = rule.initialize(this);
+ } catch (IOException ie) {
+ throw new IOException(ie);
+ }
+ if (isMappingNotEmpty) {
+ placementRules.add(rule);
+ }
+ }
+ } catch (ClassNotFoundException cnfe) {
+ throw new IOException(cnfe);
+ }
+ }
+ }
+
+ rmContext.getQueuePlacementManager().updateRules(placementRules);
+ }
+
+ @Lock(CapacityScheduler.class)
+ private void initializeQueues(CapacitySchedulerConfiguration conf)
+ throws YarnException {
+ try {
+ this.queueManager.initializeQueues(conf);
+
+ updatePlacementRules();
+
+ this.workflowPriorityMappingsMgr.initialize(this);
+
+ // Notify Preemption Manager
+ preemptionManager.refreshQueues(null, this.getRootQueue());
+ } catch (Exception e) {
+ throw new YarnException("Failed to initialize queues", e);
+ }
+ }
+
+ @Lock(CapacityScheduler.class)
+ private void reinitializeQueues(CapacitySchedulerConfiguration newConf)
+ throws IOException {
+ this.queueManager.reinitializeQueues(newConf);
+ updatePlacementRules();
+
+ this.workflowPriorityMappingsMgr.initialize(this);
+
+ // Notify Preemption Manager
+ preemptionManager.refreshQueues(null, this.getRootQueue());
+ }
+
+ @Override
+ public CSQueue getQueue(String queueName) {
+ if (queueName == null) {
+ return null;
+ }
+ return this.queueManager.getQueue(queueName);
+ }
+
+ /**
+ * Returns the normalized queue name, which should be used for internal
+ * queue references. Currently this is the full queue name, which
+ * unambiguously identifies a queue.
+ * @param name Name of the queue to be normalized
+ * @return The normalized (full name) of the queue
+ */
+ public String normalizeQueueName(String name) {
+ if (this.queueManager == null) {
+ return name;
+ }
+ return this.queueManager.normalizeQueueName(name);
+ }
+
+ /**
+ * Determines if a short queue name reference is ambiguous: if there are at
+ * least two queues with the same name, it is considered ambiguous;
+ * otherwise it is not.
+ * @param queueName The name of the queue to check for ambiguity
+ * @return true if there are at least 2 queues with the same name
+ */
+ public boolean isAmbiguous(String queueName) {
+ return this.queueManager.isAmbiguous(queueName);
+ }
+
+ private void addApplicationOnRecovery(ApplicationId applicationId,
+ String queueName, String user,
+ Priority priority, ApplicationPlacementContext placementContext) {
+ writeLock.lock();
+ try {
+ //check if the queue needs to be auto-created during recovery
+ CSQueue queue = getOrCreateQueueFromPlacementContext(applicationId, user,
+ queueName, placementContext, true);
+
+ if (queue == null) {
+ //During a restart, this indicates a queue was removed, which is
+ //not presently supported
+ if (!getConfiguration().shouldAppFailFast(getConfig())) {
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.KILL,
+ "Application killed on recovery as it"
+ + " was submitted to queue " + queueName
+ + " which no longer exists after restart."));
+ return;
+ } else {
+ String queueErrorMsg = "Queue named " + queueName + " missing "
+ + "during application recovery."
+ + " Queue removal during recovery is not presently "
+ + "supported by the capacity scheduler, please "
+ + "restart with all queues configured"
+ + " which were present before shutdown/restart.";
+ LOG.error(FATAL, queueErrorMsg);
+ throw new QueueInvalidException(queueErrorMsg);
+ }
+ }
+ if (!(queue instanceof LeafQueue)) {
+ // During RM restart, this means leaf queue was converted to a parent
+ // queue, which is not supported for running apps.
+ if (!getConfiguration().shouldAppFailFast(getConfig())) {
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.KILL,
+ "Application killed on recovery as it was "
+ + "submitted to queue " + queueName
+ + " which is no longer a leaf queue after restart."));
+ return;
+ } else {
+ String queueErrorMsg = "Queue named " + queueName
+ + " is no longer a leaf queue during application recovery."
+ + " Changing a leaf queue to a parent queue during recovery is"
+ + " not presently supported by the capacity scheduler. Please"
+ + " restart with leaf queues before shutdown/restart continuing"
+ + " as leaf queues.";
+ LOG.error(FATAL, queueErrorMsg);
+ throw new QueueInvalidException(queueErrorMsg);
+ }
+ }
+ // When recovering apps in this queue while the queue is in STOPPED state,
+ // its previous state must have been DRAINING. So we automatically
+ // transition the state back to DRAINING for recovery.
+ if (queue.getState() == QueueState.STOPPED) {
+ ((LeafQueue) queue).recoverDrainingState();
+ }
+ // Submit to the queue
+ try {
+ queue.submitApplication(applicationId, user, queueName);
+ } catch (AccessControlException ace) {
+ // Ignore the exception for recovered app as the app was previously
+ // accepted.
+ LOG.warn("AccessControlException received when trying to recover "
+ + applicationId + " in queue " + queueName + " for user " + user
+ + ". Since the app was in the queue prior to recovery, the Capacity"
+ + " Scheduler will recover the app anyway.", ace);
+ }
+ queue.getMetrics().submitApp(user);
+ SchedulerApplication<FiCaSchedulerApp> application =
+ new SchedulerApplication<>(queue, user, priority);
+ applications.put(applicationId, application);
+ LOG.info("Accepted application " + applicationId + " from user: " + user
+ + ", in queue: " + queueName);
+ LOG.debug(
+ applicationId + " is recovering. Skip notifying APP_ACCEPTED");
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
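+ /**
+ * Looks up the target queue, auto-creating it under the parent given by the
+ * placement context when it does not exist yet. On auto-creation failure
+ * the app is killed (recovery) or rejected (submission), subject to the
+ * fail-fast setting.
+ */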
+ private CSQueue getOrCreateQueueFromPlacementContext(ApplicationId
+ applicationId, String user, String queueName,
+ ApplicationPlacementContext placementContext,
+ boolean isRecovery) {
+
+ CSQueue queue = getQueue(queueName);
+
+ if (queue == null) {
+ if (placementContext != null && placementContext.hasParentQueue()) {
+ try {
+ return autoCreateLeafQueue(placementContext);
+ } catch (YarnException | IOException e) {
+ if (isRecovery) {
+ if (!getConfiguration().shouldAppFailFast(getConfig())) {
+ LOG.error("Could not auto-create leaf queue " + queueName +
+ " due to : ", e);
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.KILL,
+ "Application killed on recovery"
+ + " as it was submitted to queue " + queueName
+ + " which could not be auto-created"));
+ } else {
+ String queueErrorMsg =
+ "Queue named " + queueName + " could not be "
+ + "auto-created during application recovery.";
+ LOG.error(FATAL, queueErrorMsg, e);
+ throw new QueueInvalidException(queueErrorMsg);
+ }
+ } else {
+ LOG.error("Could not auto-create leaf queue due to : ", e);
+ final String message =
+ "Application " + applicationId + " submission by user : "
+ + user
+ + " to queue : " + queueName + " failed : " + e
+ .getMessage();
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
+ message));
+ }
+ }
+ }
+ }
+ return queue;
+ }
+
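+ /**
+ * Admission path for a newly submitted application: enforces the system app
+ * limit, resolves (or auto-creates) the target leaf queue, validates the
+ * managed-parent mapping, applies workflow priority mapping, and only then
+ * submits to the queue and fires APP_ACCEPTED.
+ */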
+ private void addApplication(ApplicationId applicationId, String queueName,
+ String user, Priority priority,
+ ApplicationPlacementContext placementContext) {
+ writeLock.lock();
+ try {
+ if (isSystemAppsLimitReached()) {
+ String message = "Maximum system application limit reached,"
+ + "cannot accept submission of application: " + applicationId;
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
+ message));
+ return;
+ }
+
+ //Could be a potential auto-created leaf queue
+ CSQueue queue = getOrCreateQueueFromPlacementContext(applicationId, user,
+ queueName, placementContext, false);
+
+ if (queue == null) {
+ String message;
+ if (isAmbiguous(queueName)) {
+ message = "Application " + applicationId
+ + " submitted by user " + user
+ + " to ambiguous queue: " + queueName
+ + " please use full queue path instead.";
+ } else {
+ message =
+ "Application " + applicationId + " submitted by user " + user
+ + " to unknown queue: " + queueName;
+ }
+
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
+ message));
+ return;
+ }
+
+ if (!(queue instanceof LeafQueue)) {
+ String message =
+ "Application " + applicationId + " submitted by user : " + user
+ + " to non-leaf queue : " + queueName;
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
+ message));
+ return;
+ } else if (queue instanceof AutoCreatedLeafQueue && queue
+ .getParent() instanceof ManagedParentQueue) {
+
+ //If queue already exists and auto-queue creation was not required,
+ //placement context should not be null
+ if (placementContext == null) {
+ String message =
+ "Application " + applicationId + " submission by user : " + user
+ + " to specified queue : " + queueName + " is prohibited. "
+ + "Verify automatic queue mapping for user exists in " +
+ QUEUE_MAPPING;
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
+ message));
+ return;
+ // For a queue that already exists and was not auto-created above, its
+ // parent queue should match the parent queue specified in the queue
+ // mapping.
+ } else if (!queue.getParent().getQueueShortName().equals(
+ placementContext.getParentQueue())
+ && !queue.getParent().getQueuePath().equals(
+ placementContext.getParentQueue())) {
+ String message =
+ "Auto created Leaf queue " + placementContext.getQueue() + " "
+ + "already exists under queue : " + queue
+ .getParent().getQueueShortName()
+ + ". But Queue mapping configuration " +
+ CapacitySchedulerConfiguration.QUEUE_MAPPING + " has been "
+ + "updated to a different parent queue : "
+ + placementContext.getParentQueue()
+ + " for the specified user : " + user;
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
+ message));
+ return;
+ }
+ }
+
+ try {
+ priority = workflowPriorityMappingsMgr.mapWorkflowPriorityForApp(
+ applicationId, queue, user, priority);
+ } catch (YarnException e) {
+ String message = "Failed to submit application " + applicationId +
+ " submitted by user " + user + " reason: " + e.getMessage();
+ this.rmContext.getDispatcher().getEventHandler().handle(new RMAppEvent(
+ applicationId, RMAppEventType.APP_REJECTED, message));
+ return;
+ }
+
+ // Submit to the queue
+ try {
+ queue.submitApplication(applicationId, user, queueName);
+ } catch (AccessControlException ace) {
+ LOG.info("Failed to submit application " + applicationId + " to queue "
+ + queueName + " from user " + user, ace);
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
+ ace.toString()));
+ return;
+ }
+ // update the metrics
+ queue.getMetrics().submitApp(user);
+ SchedulerApplication<FiCaSchedulerApp> application =
+ new SchedulerApplication<>(queue, user, priority);
+ applications.put(applicationId, application);
+ LOG.info("Accepted application " + applicationId + " from user: " + user
+ + ", in queue: " + queueName);
+ rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_ACCEPTED));
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
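+ /**
+ * Creates the FiCaSchedulerApp for a new attempt, optionally transferring
+ * state from the previous attempt (work-preserving AM restart), re-checks
+ * runnability against the max-running limits, and registers the attempt
+ * with its queue.
+ */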
+ private void addApplicationAttempt(
+ ApplicationAttemptId applicationAttemptId,
+ boolean transferStateFromPreviousAttempt,
+ boolean isAttemptRecovering) {
+ writeLock.lock();
+ try {
+ SchedulerApplication<FiCaSchedulerApp> application = applications.get(
+ applicationAttemptId.getApplicationId());
+ if (application == null) {
+ LOG.warn("Application " + applicationAttemptId.getApplicationId()
+ + " cannot be found in scheduler.");
+ return;
+ }
+ CSQueue queue = (CSQueue) application.getQueue();
+
+ FiCaSchedulerApp attempt = new FiCaSchedulerApp(applicationAttemptId,
+ application.getUser(), queue, queue.getAbstractUsersManager(),
+ rmContext, application.getPriority(), isAttemptRecovering,
+ activitiesManager);
+ if (transferStateFromPreviousAttempt) {
+ attempt.transferStateFromPreviousAttempt(
+ application.getCurrentAppAttempt());
+ }
+ application.setCurrentAppAttempt(attempt);
+
+ // Update the attempt priority to the latest to avoid a race condition,
+ // i.e. the SchedulerApplicationAttempt is created with the old priority
+ // but has not yet been set via SchedulerApplication#setCurrentAppAttempt.
+ // The scenario is:
+ // 1. SchedulerApplicationAttempt is created with the old priority.
+ // 2. updateApplicationPriority() updates the SchedulerApplication. Since
+ // currentAttempt is null, it just returns.
+ // 3. The SchedulerApplicationAttempt is set via
+ // SchedulerApplication#setCurrentAppAttempt.
+ attempt.setPriority(application.getPriority());
+
+ maxRunningEnforcer.checkRunnabilityWithUpdate(attempt);
+ maxRunningEnforcer.trackApp(attempt);
+
+ queue.submitApplicationAttempt(attempt, application.getUser());
+ LOG.info("Added Application Attempt " + applicationAttemptId
+ + " to scheduler from user " + application.getUser() + " in queue "
+ + queue.getQueuePath());
+ if (isAttemptRecovering) {
+ LOG.debug("{} is recovering. Skipping notifying ATTEMPT_ADDED",
+ applicationAttemptId);
+ } else {
+ rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppAttemptEvent(applicationAttemptId,
+ RMAppAttemptEventType.ATTEMPT_ADDED));
+ }
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ private void doneApplication(ApplicationId applicationId,
+ RMAppState finalState) {
+ writeLock.lock();
+ try {
+ SchedulerApplication<FiCaSchedulerApp> application = applications.get(
+ applicationId);
+ if (application == null) {
+ // The AppRemovedSchedulerEvent maybe sent on recovery for completed
+ // apps, ignore it.
+ LOG.warn("Couldn't find application " + applicationId);
+ return;
+ }
+ CSQueue queue = (CSQueue) application.getQueue();
+ if (!(queue instanceof LeafQueue)) {
+ LOG.error("Cannot finish application " + "from non-leaf queue: " + queue
+ .getQueuePath());
+ } else {
+ queue.finishApplication(applicationId, application.getUser());
+ }
+ application.stop(finalState);
+ applications.remove(applicationId);
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ private void doneApplicationAttempt(
+ ApplicationAttemptId applicationAttemptId,
+ RMAppAttemptState rmAppAttemptFinalState, boolean keepContainers) {
+ writeLock.lock();
+ try {
+ LOG.info("Application Attempt " + applicationAttemptId + " is done."
+ + " finalState=" + rmAppAttemptFinalState);
+
+ FiCaSchedulerApp attempt = getApplicationAttempt(applicationAttemptId);
+ SchedulerApplication<FiCaSchedulerApp> application = applications.get(
+ applicationAttemptId.getApplicationId());
+
+ if (application == null || attempt == null) {
+ LOG.info(
+ "Unknown application " + applicationAttemptId + " has completed!");
+ return;
+ }
+
+ // Release all the allocated, acquired, running containers
+ for (RMContainer rmContainer : attempt.getLiveContainers()) {
+ if (keepContainers && rmContainer.getState().equals(
+ RMContainerState.RUNNING)) {
+ // do not kill the running container in the case of work-preserving AM
+ // restart.
+ LOG.info("Skip killing " + rmContainer.getContainerId());
+ continue;
+ }
+ super.completedContainer(rmContainer, SchedulerUtils
+ .createAbnormalContainerStatus(rmContainer.getContainerId(),
+ SchedulerUtils.COMPLETED_APPLICATION),
+ RMContainerEventType.KILL);
+ }
+
+ // Release all reserved containers
+ for (RMContainer rmContainer : attempt.getReservedContainers()) {
+ super.completedContainer(rmContainer, SchedulerUtils
+ .createAbnormalContainerStatus(rmContainer.getContainerId(),
+ "Application Complete"), RMContainerEventType.KILL);
+ }
+
+ // Clean up pending requests, metrics etc.
+ attempt.stop(rmAppAttemptFinalState);
+
+ // Inform the queue
+ Queue queue = attempt.getQueue();
+ CSQueue csQueue = (CSQueue) queue;
+ if (!(csQueue instanceof LeafQueue)) {
+ LOG.error(
+ "Cannot finish application " + "from non-leaf queue: "
+ + csQueue.getQueuePath());
+ } else {
+ csQueue.finishApplicationAttempt(attempt, csQueue.getQueuePath());
+
+ maxRunningEnforcer.untrackApp(attempt);
+ if (attempt.isRunnable()) {
+ maxRunningEnforcer.updateRunnabilityOnAppRemoval(attempt);
+ }
+ }
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ /**
+ * Normalize a list of SchedulingRequest.
+ *
+ * @param asks scheduling requests to normalize
+ */
+ private void normalizeSchedulingRequests(List<SchedulingRequest> asks) {
+ if (asks == null) {
+ return;
+ }
+ Resource maxAllocation = getMaximumResourceCapability();
+ for (SchedulingRequest ask : asks) {
+ ResourceSizing sizing = ask.getResourceSizing();
+ if (sizing != null && sizing.getResources() != null) {
+ sizing.setResources(
+ getNormalizedResource(sizing.getResources(), maxAllocation));
+ }
+ }
+ }
+
+ @Override
+ @Lock(Lock.NoLock.class)
+ public Allocation allocate(ApplicationAttemptId applicationAttemptId,
+ List<ResourceRequest> ask, List<SchedulingRequest> schedulingRequests,
+ List<ContainerId> release, List<String> blacklistAdditions,
+ List<String> blacklistRemovals, ContainerUpdates updateRequests) {
+ FiCaSchedulerApp application = getApplicationAttempt(applicationAttemptId);
+ if (application == null) {
+ LOG.error("Calling allocate on removed or non existent application " +
+ applicationAttemptId.getApplicationId());
+ return EMPTY_ALLOCATION;
+ }
+
+ // The allocate call may be a leftover from the previous attempt, and it
+ // can impact the current attempt, e.g. by confusing the requests and
+ // allocations for the current attempt's AM container.
+ // Note that the outside precondition check for the attempt id may be
+ // outdated here, so double-checking it here is necessary.
+ if (!application.getApplicationAttemptId().equals(applicationAttemptId)) {
+ LOG.error("Calling allocate on previous or removed " +
+ "or non existent application attempt " + applicationAttemptId);
+ return EMPTY_ALLOCATION;
+ }
+
+ // Handle all container updates
+ handleContainerUpdates(application, updateRequests);
+
+ // Release containers
+ releaseContainers(release, application);
+
+ LeafQueue updateDemandForQueue = null;
+
+ // Sanity check for new allocation requests
+ normalizeResourceRequests(ask);
+
+ // Normalize scheduling requests
+ normalizeSchedulingRequests(schedulingRequests);
+
+ Allocation allocation;
+
+ // make sure we aren't stopping/removing the application
+ // when the allocate comes in
+ application.getWriteLock().lock();
+ try {
+ if (application.isStopped()) {
+ return EMPTY_ALLOCATION;
+ }
+
+ // Process resource requests
+ if (!ask.isEmpty() || (schedulingRequests != null && !schedulingRequests
+ .isEmpty())) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(
+ "allocate: pre-update " + applicationAttemptId + " ask size ="
+ + ask.size());
+ application.showRequests();
+ }
+
+ // Update application requests
+ if (application.updateResourceRequests(ask) || application
+ .updateSchedulingRequests(schedulingRequests)) {
+ updateDemandForQueue = (LeafQueue) application.getQueue();
+ }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("allocate: post-update");
+ application.showRequests();
+ }
+ }
+
+ application.updateBlacklist(blacklistAdditions, blacklistRemovals);
+
+ allocation = application.getAllocation(getResourceCalculator(),
+ getClusterResource(), getMinimumResourceCapability());
+ } finally {
+ application.getWriteLock().unlock();
+ }
+
+ if (updateDemandForQueue != null && !application
+ .isWaitingForAMContainer()) {
+ updateDemandForQueue.getOrderingPolicy().demandUpdated(application);
+ }
+
+ LOG.debug("Allocation for application {} : {} with cluster resource : {}",
+ applicationAttemptId, allocation, getClusterResource());
+ return allocation;
+ }
+
+ @Override
+ @Lock(Lock.NoLock.class)
+ public QueueInfo getQueueInfo(String queueName,
+ boolean includeChildQueues, boolean recursive)
+ throws IOException {
+ CSQueue queue = this.getQueue(queueName);
+ if (queue == null) {
+ if (isAmbiguous(queueName)) {
+ throw new IOException("Ambiguous queue reference: " + queueName
+ + " please use full queue path instead.");
+ } else {
+ throw new IOException("Unknown queue: " + queueName);
+ }
+
+ }
+ return queue.getQueueInfo(includeChildQueues, recursive);
+ }
+
+ @Override
+ @Lock(Lock.NoLock.class)
+ public List<QueueUserACLInfo> getQueueUserAclInfo() {
+ UserGroupInformation user = null;
+ try {
+ user = UserGroupInformation.getCurrentUser();
+ } catch (IOException ioe) {
+ // should never happen
+ return new ArrayList<QueueUserACLInfo>();
+ }
+
+ return getRootQueue().getQueueUserAclInfo(user);
+ }
+
+ @Override
+ protected void nodeUpdate(RMNode rmNode) {
+ long begin = System.nanoTime();
+ readLock.lock();
+ try {
+ setLastNodeUpdateTime(Time.now());
+ super.nodeUpdate(rmNode);
+ } finally {
+ readLock.unlock();
+ }
+
+ // Try to do scheduling
+ if (!scheduleAsynchronously) {
+ writeLock.lock();
+ try {
+ // reset allocation and reservation stats before we start doing any
+ // work
+ updateSchedulerHealth(lastNodeUpdateTime, rmNode.getNodeID(),
+ CSAssignment.NULL_ASSIGNMENT);
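+ // Load-based gating introduced by this patch: skip heartbeat-driven
+ // allocation when the external load-metric server reports this node as
+ // overloaded, so an already hot NodeManager receives no new containers.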
+ if (!loadsMetricServerRequestThread.isNodeOverLoad(rmNode.getHostName())) {
+ allocateContainersToNode(rmNode.getNodeID(), true);
+ }
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ long latency = System.nanoTime() - begin;
+ CapacitySchedulerMetrics.getMetrics().addNodeUpdate(latency);
+ }
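+ // A minimal sketch of the overload check used above, assuming the
+ // LoadsMetricServerRequestThread (defined elsewhere in this patch) caches
+ // per-host load reports pulled from an external metric server; the names
+ // below are illustrative only, not the actual implementation:
+ //
+ // public boolean isNodeOverLoad(String hostName) {
+ //   NodeLoadReport report = hostToLoadReport.get(hostName); // hypothetical
+ //   return report != null && report.exceedsThreshold();     // hypothetical
+ // }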
+
+ /**
+ * Process resource update on a node.
+ */
+ private void updateNodeAndQueueResource(RMNode nm,
+ ResourceOption resourceOption) {
+ writeLock.lock();
+ try {
+ updateNodeResource(nm, resourceOption);
+ Resource clusterResource = getClusterResource();
+ getRootQueue().updateClusterResource(clusterResource,
+ new ResourceLimits(clusterResource));
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ /**
+ * Process node labels update on a node.
+ */
+ private void updateLabelsOnNode(NodeId nodeId,
+ Set<String> newLabels) {
+ FiCaSchedulerNode node = nodeTracker.getNode(nodeId);
+ if (null == node) {
+ return;
+ }
+
+ // Get new partition, we have only one partition per node
+ String newPartition;
+ if (newLabels.isEmpty()) {
+ newPartition = RMNodeLabelsManager.NO_LABEL;
+ } else {
+ newPartition = newLabels.iterator().next();
+ }
+
+ // old partition as well
+ String oldPartition = node.getPartition();
+
+ // Update resources of these containers
+ for (RMContainer rmContainer : node.getCopiedListOfRunningContainers()) {
+ FiCaSchedulerApp application = getApplicationAttempt(
+ rmContainer.getApplicationAttemptId());
+ if (null != application) {
+ application.nodePartitionUpdated(rmContainer, oldPartition,
+ newPartition);
+ } else {
+ LOG.warn("There's something wrong, some RMContainers running on"
+ + " a node, but we cannot find SchedulerApplicationAttempt "
+ + "for it. Node=" + node.getNodeID() + " applicationAttemptId="
+ + rmContainer.getApplicationAttemptId());
+ continue;
+ }
+ }
+
+ // Unreserve container on this node
+ RMContainer reservedContainer = node.getReservedContainer();
+ if (null != reservedContainer) {
+ killReservedContainer(reservedContainer);
+ }
+
+ // Update node labels after we've done this
+ node.updateLabels(newLabels);
+ }
+
+ private void updateSchedulerHealth(long now, NodeId nodeId,
+ CSAssignment assignment) {
+ List<AssignmentInformation.AssignmentDetails> allocations =
+ assignment.getAssignmentInformation().getAllocationDetails();
+ List<AssignmentInformation.AssignmentDetails> reservations =
+ assignment.getAssignmentInformation().getReservationDetails();
+ // Get nodeId from allocated container if incoming argument is null.
+ NodeId updatedNodeid = (nodeId == null)
+ ? allocations.get(allocations.size() - 1).rmContainer.getNodeId()
+ : nodeId;
+
+ if (!allocations.isEmpty()) {
+ ContainerId allocatedContainerId =
+ allocations.get(allocations.size() - 1).containerId;
+ String allocatedQueue = allocations.get(allocations.size() - 1).queue;
+ schedulerHealth.updateAllocation(now, updatedNodeid, allocatedContainerId,
+ allocatedQueue);
+ }
+ if (!reservations.isEmpty()) {
+ ContainerId reservedContainerId =
+ reservations.get(reservations.size() - 1).containerId;
+ String reservedQueue = reservations.get(reservations.size() - 1).queue;
+ schedulerHealth.updateReservation(now, updatedNodeid, reservedContainerId,
+ reservedQueue);
+ }
+ schedulerHealth.updateSchedulerReservationCounts(assignment
+ .getAssignmentInformation().getNumReservations());
+ schedulerHealth.updateSchedulerAllocationCounts(assignment
+ .getAssignmentInformation().getNumAllocations());
+ schedulerHealth.updateSchedulerRunDetails(now, assignment
+ .getAssignmentInformation().getAllocated(), assignment
+ .getAssignmentInformation().getReserved());
+ }
+
+ private boolean canAllocateMore(CSAssignment assignment, int offswitchCount,
+ int assignedContainers) {
+ // Current assignment shouldn't be empty
+ if (assignment == null
+ || Resources.equals(assignment.getResource(), Resources.none())) {
+ return false;
+ }
+
+ // offswitch assignment should be under threshold
+ if (offswitchCount >= offswitchPerHeartbeatLimit) {
+ return false;
+ }
+
+ // And it should not be a reserved container
+ if (assignment.getAssignmentInformation().getNumReservations() > 0) {
+ return false;
+ }
+
+ // assignMultipleEnabled should be ON,
+ // and assignedContainers should be under threshold
+ return assignMultipleEnabled
+ && (maxAssignPerHeartbeat == -1
+ || assignedContainers < maxAssignPerHeartbeat);
+ }
+
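+ /**
+ * Builds the candidate set for one allocation round: just the given node by
+ * default, or all nodes of that node's partition when multi-node placement
+ * is enabled.
+ */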
+ private CandidateNodeSet<FiCaSchedulerNode> getCandidateNodeSet(
+ FiCaSchedulerNode node) {
+ CandidateNodeSet<FiCaSchedulerNode> candidates =
+ new SimpleCandidateNodeSet<>(node);
+ if (multiNodePlacementEnabled) {
+ Map<NodeId, FiCaSchedulerNode> nodesByPartition = new HashMap<>();
+ List<FiCaSchedulerNode> nodes = nodeTracker
+ .getNodesPerPartition(node.getPartition());
+ if (nodes != null && !nodes.isEmpty()) {
+ nodes.forEach(n -> nodesByPartition.put(n.getNodeID(), n));
+ candidates = new SimpleCandidateNodeSet<>(
+ nodesByPartition, node.getPartition());
+ }
+ }
+ return candidates;
+ }
+
+ /**
+ * Make sure the node still exists when doing allocation, and construct a
+ * {@link CandidateNodeSet} before proceeding.
+ */
+ private void allocateContainersToNode(NodeId nodeId,
+ boolean withNodeHeartbeat) {
+ FiCaSchedulerNode node = getNode(nodeId);
+ if (null != node) {
+ int offswitchCount = 0;
+ int assignedContainers = 0;
+
+ CandidateNodeSet<FiCaSchedulerNode> candidates = getCandidateNodeSet(
+ node);
+ CSAssignment assignment = allocateContainersToNode(candidates,
+ withNodeHeartbeat);
+ // Only check if we can allocate more container on the same node when
+ // scheduling is triggered by node heartbeat
+ if (null != assignment && withNodeHeartbeat) {
+ if (assignment.getType() == NodeType.OFF_SWITCH) {
+ offswitchCount++;
+ }
+
+ if (Resources.greaterThan(calculator, getClusterResource(),
+ assignment.getResource(), Resources.none())) {
+ assignedContainers++;
+ }
+
+ while (canAllocateMore(assignment, offswitchCount,
+ assignedContainers)) {
+ // Try to see if it is possible to allocate multiple containers for
+ // the same node heartbeat
+ assignment = allocateContainersToNode(candidates, true);
+
+ if (null != assignment
+ && assignment.getType() == NodeType.OFF_SWITCH) {
+ offswitchCount++;
+ }
+
+ if (null != assignment
+ && Resources.greaterThan(calculator, getClusterResource(),
+ assignment.getResource(), Resources.none())) {
+ assignedContainers++;
+ }
+ }
+
+ if (offswitchCount >= offswitchPerHeartbeatLimit) {
+ LOG.debug("Assigned maximum number of off-switch containers: {},"
+ + " assignments so far: {}", offswitchCount, assignment);
+ }
+ }
+ }
+ }
+
+ /*
+ * Logic for allocating containers on a single node (old behavior).
+ */
+ private CSAssignment allocateContainerOnSingleNode(
+ CandidateNodeSet<FiCaSchedulerNode> candidates, FiCaSchedulerNode node,
+ boolean withNodeHeartbeat) {
+ LOG.debug("Trying to schedule on node: {}, available: {}",
+ node.getNodeName(), node.getUnallocatedResource());
+
+ // Backward-compatible way to make sure the previous behavior, where
+ // allocation is driven by node heartbeats, keeps working.
+ if (getNode(node.getNodeID()) != node) {
+ LOG.error("Trying to schedule on a removed node, please double check, "
+ + "nodeId=" + node.getNodeID());
+ ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
+ "", getRootQueue().getQueuePath(), ActivityState.REJECTED,
+ ActivityDiagnosticConstant.INIT_CHECK_SINGLE_NODE_REMOVED);
+ ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
+ node);
+ return null;
+ }
+
+ // Assign new containers...
+ // 1. Check for reserved applications
+ // 2. Schedule if there are no reservations
+ RMContainer reservedContainer = node.getReservedContainer();
+ if (reservedContainer != null) {
+ allocateFromReservedContainer(node, withNodeHeartbeat, reservedContainer);
+ // Do not schedule if there are any reservations to fulfill on the node
+ LOG.debug("Skipping scheduling since node {} is reserved by"
+ + " application {}", node.getNodeID(), reservedContainer.
+ getContainerId().getApplicationAttemptId());
+ return null;
+ }
+
+ // First check if we can schedule. Since we are looking at a single node
+ // this time, try to schedule if the node has any available or killable
+ // resource.
+ if (calculator.computeAvailableContainers(Resources
+ .add(node.getUnallocatedResource(), node.getTotalKillableResources()),
+ minimumAllocation) <= 0) {
+ LOG.debug("This node " + node.getNodeID() + " doesn't have sufficient "
+ + "available or preemptible resource for minimum allocation");
+ ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
+ "", getRootQueue().getQueuePath(), ActivityState.REJECTED,
+ ActivityDiagnosticConstant.
+ INIT_CHECK_SINGLE_NODE_RESOURCE_INSUFFICIENT);
+ ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
+ node);
+ return null;
+ }
+
+ return allocateOrReserveNewContainers(candidates, withNodeHeartbeat);
+ }
+
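+ /**
+ * Tries to fulfill an outstanding reservation on the node; the outcome is a
+ * fulfilled reservation, a re-reservation, or no assignment, and it is
+ * committed through submitResourceCommitRequest.
+ */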
+ private void allocateFromReservedContainer(FiCaSchedulerNode node,
+ boolean withNodeHeartbeat, RMContainer reservedContainer) {
+ FiCaSchedulerApp reservedApplication = getCurrentAttemptForContainer(
+ reservedContainer.getContainerId());
+ if (reservedApplication == null) {
+ LOG.error(
+ "Trying to schedule for a finished app, please double check. nodeId="
+ + node.getNodeID() + " container=" + reservedContainer
+ .getContainerId());
+ return;
+ }
+
+ // Try to fulfill the reservation
+ LOG.debug("Trying to fulfill reservation for application {} on node: {}",
+ reservedApplication.getApplicationId(), node.getNodeID());
+
+ LeafQueue queue = ((LeafQueue) reservedApplication.getQueue());
+ CSAssignment assignment = queue.assignContainers(getClusterResource(),
+ new SimpleCandidateNodeSet<>(node),
+ // TODO, now we only consider limits for parent for non-labeled
+ // resources, should consider labeled resources as well.
+ new ResourceLimits(labelManager
+ .getResourceByLabel(RMNodeLabelsManager.NO_LABEL,
+ getClusterResource())),
+ SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
+
+ if (assignment.isFulfilledReservation()) {
+ if (withNodeHeartbeat) {
+ // Only update SchedulerHealth in sync scheduling; the existing
+ // SchedulerHealth data structures need to be extended before they
+ // can be updated in async mode.
+ updateSchedulerHealth(lastNodeUpdateTime, node.getNodeID(),
+ assignment);
+ }
+
+ schedulerHealth.updateSchedulerFulfilledReservationCounts(1);
+
+ ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
+ queue.getParent().getQueuePath(), queue.getQueuePath(),
+ ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY);
+ ActivitiesLogger.NODE.finishAllocatedNodeAllocation(activitiesManager,
+ node, reservedContainer.getContainerId(),
+ AllocationState.ALLOCATED_FROM_RESERVED);
+ } else if (assignment.getAssignmentInformation().getNumReservations() > 0) {
+ ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
+ queue.getParent().getQueuePath(), queue.getQueuePath(),
+ ActivityState.RE_RESERVED, ActivityDiagnosticConstant.EMPTY);
+ ActivitiesLogger.NODE.finishAllocatedNodeAllocation(activitiesManager,
+ node, reservedContainer.getContainerId(), AllocationState.RESERVED);
+ }
+
+ assignment.setSchedulingMode(
+ SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
+ submitResourceCommitRequest(getClusterResource(), assignment);
+ }
+
+ private CSAssignment allocateOrReserveNewContainers(
+ CandidateNodeSet<FiCaSchedulerNode> candidates,
+ boolean withNodeHeartbeat) {
+ CSAssignment assignment = getRootQueue().assignContainers(
+ getClusterResource(), candidates, new ResourceLimits(labelManager
+ .getResourceByLabel(candidates.getPartition(),
+ getClusterResource())),
+ SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
+
+ assignment.setSchedulingMode(SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
+ submitResourceCommitRequest(getClusterResource(), assignment);
+
+ if (Resources.greaterThan(calculator, getClusterResource(),
+ assignment.getResource(), Resources.none())) {
+ FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);
+ NodeId nodeId = null;
+ if (node != null) {
+ nodeId = node.getNodeID();
+ }
+ if (withNodeHeartbeat) {
+ updateSchedulerHealth(lastNodeUpdateTime, nodeId, assignment);
+ }
+ return assignment;
+ }
+
+ // Only do non-exclusive allocation when node has node-labels.
+ if (StringUtils.equals(candidates.getPartition(),
+ RMNodeLabelsManager.NO_LABEL)) {
+ return null;
+ }
+
+ // Only do non-exclusive allocation when the node-label supports that
+ try {
+ if (rmContext.getNodeLabelManager().isExclusiveNodeLabel(
+ candidates.getPartition())) {
+ return null;
+ }
+ } catch (IOException e) {
+ LOG.warn(
+ "Exception when trying to get exclusivity of node label=" + candidates
+ .getPartition(), e);
+ return null;
+ }
+
+ // Try to use NON_EXCLUSIVE
+ assignment = getRootQueue().assignContainers(getClusterResource(),
+ candidates,
+ // TODO, now we only consider limits for parent for non-labeled
+ // resources, should consider labeled resources as well.
+ new ResourceLimits(labelManager
+ .getResourceByLabel(RMNodeLabelsManager.NO_LABEL,
+ getClusterResource())),
+ SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY);
+ assignment.setSchedulingMode(SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY);
+ submitResourceCommitRequest(getClusterResource(), assignment);
+
+ return assignment;
+ }
+
+ /*
+ * New behavior: allocate containers considering multiple nodes.
+ */
+ private CSAssignment allocateContainersOnMultiNodes(
+ CandidateNodeSet<FiCaSchedulerNode> candidates) {
+ // Since we are looking at multiple nodes this time, try to schedule if
+ // the partition has any available or killable resource.
+ if (getRootQueue().getQueueCapacities().getUsedCapacity(
+ candidates.getPartition()) >= 1.0f
+ && preemptionManager.getKillableResource(
+ CapacitySchedulerConfiguration.ROOT, candidates.getPartition())
+ == Resources.none()) {
+ // Try to allocate from reserved containers
+ for (FiCaSchedulerNode node : candidates.getAllNodes().values()) {
+ RMContainer reservedContainer = node.getReservedContainer();
+ if (reservedContainer != null) {
+ allocateFromReservedContainer(node, false, reservedContainer);
+ }
+ }
+ LOG.debug("This partition '{}' doesn't have available or "
+ + "killable resource", candidates.getPartition());
+ ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, null,
+ "", getRootQueue().getQueuePath(), ActivityState.REJECTED,
+ ActivityDiagnosticConstant.
+ INIT_CHECK_PARTITION_RESOURCE_INSUFFICIENT);
+ ActivitiesLogger.NODE
+ .finishSkippedNodeAllocation(activitiesManager, null);
+ return null;
+ }
+
+ return allocateOrReserveNewContainers(candidates, false);
+ }
+
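+ /**
+ * Common entry point for both scheduling paths: bails out until the
+ * scheduler is ready after work-preserving recovery, then dispatches to the
+ * single-node or multi-node logic and records the allocation latency.
+ */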
+ @VisibleForTesting
+ CSAssignment allocateContainersToNode(
+ CandidateNodeSet<FiCaSchedulerNode> candidates,
+ boolean withNodeHeartbeat) {
+ if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext
+ .isSchedulerReadyForAllocatingContainers()) {
+ return null;
+ }
+
+ long startTime = System.nanoTime();
+
+ // Backward-compatible way to make sure the previous behavior, where
+ // allocation is driven by node heartbeats, keeps working.
+ FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);
+
+ // We have two different logics to handle allocation on single node / multi
+ // nodes.
+ CSAssignment assignment;
+ if (!multiNodePlacementEnabled) {
+ ActivitiesLogger.NODE.startNodeUpdateRecording(activitiesManager,
+ node.getNodeID());
+ assignment = allocateContainerOnSingleNode(candidates,
+ node, withNodeHeartbeat);
+ ActivitiesLogger.NODE.finishNodeUpdateRecording(activitiesManager,
+ node.getNodeID(), candidates.getPartition());
+ } else {
+ ActivitiesLogger.NODE.startNodeUpdateRecording(activitiesManager,
+ ActivitiesManager.EMPTY_NODE_ID);
+ assignment = allocateContainersOnMultiNodes(candidates);
+ ActivitiesLogger.NODE.finishNodeUpdateRecording(activitiesManager,
+ ActivitiesManager.EMPTY_NODE_ID, candidates.getPartition());
+ }
+
+ if (assignment != null && assignment.getAssignmentInformation() != null
+ && assignment.getAssignmentInformation().getNumAllocations() > 0) {
+ long allocateTime = System.nanoTime() - startTime;
+ CapacitySchedulerMetrics.getMetrics().addAllocate(allocateTime);
+ }
+ return assignment;
+ }
+
+ @Override
+ public void handle(SchedulerEvent event) {
+ switch (event.getType()) {
+ case NODE_ADDED: {
+ NodeAddedSchedulerEvent nodeAddedEvent = (NodeAddedSchedulerEvent) event;
+ addNode(nodeAddedEvent.getAddedRMNode());
+ recoverContainersOnNode(nodeAddedEvent.getContainerReports(),
+ nodeAddedEvent.getAddedRMNode());
+ }
+ break;
+ case NODE_REMOVED: {
+ NodeRemovedSchedulerEvent nodeRemovedEvent = (NodeRemovedSchedulerEvent) event;
+ removeNode(nodeRemovedEvent.getRemovedRMNode());
+ }
+ break;
+ case NODE_RESOURCE_UPDATE: {
+ NodeResourceUpdateSchedulerEvent nodeResourceUpdatedEvent =
+ (NodeResourceUpdateSchedulerEvent) event;
+ updateNodeAndQueueResource(nodeResourceUpdatedEvent.getRMNode(),
+ nodeResourceUpdatedEvent.getResourceOption());
+ }
+ break;
+ case NODE_LABELS_UPDATE: {
+ NodeLabelsUpdateSchedulerEvent labelUpdateEvent =
+ (NodeLabelsUpdateSchedulerEvent) event;
+
+ updateNodeLabelsAndQueueResource(labelUpdateEvent);
+ }
+ break;
+ case NODE_ATTRIBUTES_UPDATE: {
+ NodeAttributesUpdateSchedulerEvent attributeUpdateEvent =
+ (NodeAttributesUpdateSchedulerEvent) event;
+
+ updateNodeAttributes(attributeUpdateEvent);
+ }
+ break;
+ case NODE_UPDATE: {
+ NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent) event;
+ nodeUpdate(nodeUpdatedEvent.getRMNode());
+ }
+ break;
+ case APP_ADDED: {
+ AppAddedSchedulerEvent appAddedEvent = (AppAddedSchedulerEvent) event;
+ String queueName = resolveReservationQueueName(appAddedEvent.getQueue(),
+ appAddedEvent.getApplicationId(), appAddedEvent.getReservationID(),
+ appAddedEvent.getIsAppRecovering());
+ if (queueName != null) {
+ if (!appAddedEvent.getIsAppRecovering()) {
+ addApplication(appAddedEvent.getApplicationId(), queueName,
+ appAddedEvent.getUser(), appAddedEvent.getApplicatonPriority(),
+ appAddedEvent.getPlacementContext());
+ } else {
+ addApplicationOnRecovery(appAddedEvent.getApplicationId(), queueName,
+ appAddedEvent.getUser(), appAddedEvent.getApplicatonPriority(),
+ appAddedEvent.getPlacementContext());
+ }
+ }
+ }
+ break;
+ case APP_REMOVED: {
+ AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent) event;
+ doneApplication(appRemovedEvent.getApplicationID(),
+ appRemovedEvent.getFinalState());
+ }
+ break;
+ case APP_ATTEMPT_ADDED: {
+ AppAttemptAddedSchedulerEvent appAttemptAddedEvent =
+ (AppAttemptAddedSchedulerEvent) event;
+ addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId(),
+ appAttemptAddedEvent.getTransferStateFromPreviousAttempt(),
+ appAttemptAddedEvent.getIsAttemptRecovering());
+ }
+ break;
+ case APP_ATTEMPT_REMOVED: {
+ AppAttemptRemovedSchedulerEvent appAttemptRemovedEvent =
+ (AppAttemptRemovedSchedulerEvent) event;
+ doneApplicationAttempt(appAttemptRemovedEvent.getApplicationAttemptID(),
+ appAttemptRemovedEvent.getFinalAttemptState(),
+ appAttemptRemovedEvent.getKeepContainersAcrossAppAttempts());
+ }
+ break;
+ case CONTAINER_EXPIRED: {
+ ContainerExpiredSchedulerEvent containerExpiredEvent =
+ (ContainerExpiredSchedulerEvent) event;
+ ContainerId containerId = containerExpiredEvent.getContainerId();
+ if (containerExpiredEvent.isIncrease()) {
+ rollbackContainerUpdate(containerId);
+ } else {
+ completedContainer(getRMContainer(containerId),
+ SchedulerUtils.createAbnormalContainerStatus(
+ containerId,
+ SchedulerUtils.EXPIRED_CONTAINER),
+ RMContainerEventType.EXPIRE);
+ }
+ }
+ break;
+ case RELEASE_CONTAINER: {
+ RMContainer container = ((ReleaseContainerEvent) event).getContainer();
+ completedContainer(container,
+ SchedulerUtils.createAbnormalContainerStatus(
+ container.getContainerId(),
+ SchedulerUtils.RELEASED_CONTAINER),
+ RMContainerEventType.RELEASED);
+ }
+ break;
+ case KILL_RESERVED_CONTAINER: {
+ ContainerPreemptEvent killReservedContainerEvent =
+ (ContainerPreemptEvent) event;
+ RMContainer container = killReservedContainerEvent.getContainer();
+ killReservedContainer(container);
+ }
+ break;
+ case MARK_CONTAINER_FOR_PREEMPTION: {
+ ContainerPreemptEvent preemptContainerEvent =
+ (ContainerPreemptEvent) event;
+ ApplicationAttemptId aid = preemptContainerEvent.getAppId();
+ RMContainer containerToBePreempted = preemptContainerEvent.getContainer();
+ markContainerForPreemption(aid, containerToBePreempted);
+ }
+ break;
+ case MARK_CONTAINER_FOR_KILLABLE: {
+ ContainerPreemptEvent containerKillableEvent = (ContainerPreemptEvent) event;
+ RMContainer killableContainer = containerKillableEvent.getContainer();
+ markContainerForKillable(killableContainer);
+ }
+ break;
+ case MARK_CONTAINER_FOR_NONKILLABLE: {
+ if (isLazyPreemptionEnabled) {
+ ContainerPreemptEvent cancelKillContainerEvent =
+ (ContainerPreemptEvent) event;
+ markContainerForNonKillable(cancelKillContainerEvent.getContainer());
+ }
+ }
+ break;
+ case MANAGE_QUEUE: {
+ QueueManagementChangeEvent queueManagementChangeEvent =
+ (QueueManagementChangeEvent) event;
+ ParentQueue parentQueue = queueManagementChangeEvent.getParentQueue();
+ try {
+ final List<QueueManagementChange> queueManagementChanges =
+ queueManagementChangeEvent.getQueueManagementChanges();
+ ((ManagedParentQueue) parentQueue)
+ .validateAndApplyQueueManagementChanges(queueManagementChanges);
+ } catch (SchedulerDynamicEditException sde) {
+ LOG.error("Queue Management Change event cannot be applied for "
+ + "parent queue : " + parentQueue.getQueuePath(), sde);
+ } catch (IOException ioe) {
+ LOG.error("Queue Management Change event cannot be applied for "
+ + "parent queue : " + parentQueue.getQueuePath(), ioe);
+ }
+ }
+ break;
+ default:
+ LOG.error("Invalid eventtype " + event.getType() + ". Ignoring!");
+ }
+ }
+
+ private void updateNodeAttributes(
+ NodeAttributesUpdateSchedulerEvent attributeUpdateEvent) {
+ writeLock.lock();
+ try {
+ for (Entry<String, Set<NodeAttribute>> entry : attributeUpdateEvent
+ .getUpdatedNodeToAttributes().entrySet()) {
+ String hostname = entry.getKey();
+ Set<NodeAttribute> attributes = entry.getValue();
+ List<NodeId> nodeIds = nodeTracker.getNodeIdsByResourceName(hostname);
+ updateAttributesOnNode(nodeIds, attributes);
+ }
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ private void updateAttributesOnNode(List<NodeId> nodeIds,
+ Set<NodeAttribute> attributes) {
+ nodeIds.forEach((k) -> {
+ SchedulerNode node = nodeTracker.getNode(k);
+ node.updateNodeAttributes(attributes);
+ });
+ }
+
+ /**
+ * Process node labels update.
+ */
+ private void updateNodeLabelsAndQueueResource(
+ NodeLabelsUpdateSchedulerEvent labelUpdateEvent) {
+ writeLock.lock();
+ try {
+ Set<String> updateLabels = new HashSet<String>();
+ for (Entry<NodeId, Set<String>> entry : labelUpdateEvent
+ .getUpdatedNodeToLabels().entrySet()) {
+ NodeId id = entry.getKey();
+ Set<String> labels = entry.getValue();
+ FiCaSchedulerNode node = nodeTracker.getNode(id);
+
+ if (node != null) {
+ // Update old partition to list.
+ updateLabels.add(node.getPartition());
+ }
+ updateLabelsOnNode(id, labels);
+ updateLabels.addAll(labels);
+ }
+ refreshLabelToNodeCache(updateLabels);
+ Resource clusterResource = getClusterResource();
+ getRootQueue().updateClusterResource(clusterResource,
+ new ResourceLimits(clusterResource));
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ private void refreshLabelToNodeCache(Set<String> updateLabels) {
+ Map<String, Set<NodeId>> labelMapping = labelManager
+ .getLabelsToNodes(updateLabels);
+ for (String label : updateLabels) {
+ Set<NodeId> nodes = labelMapping.get(label);
+ if (nodes == null) {
+ continue;
+ }
+ nodeTracker.updateNodesPerPartition(label, nodes);
+ }
+ }
+
+ /**
+ * Add nodes to the nodeTracker. Used when validating the CS configuration
+ * by instantiating a new CS instance.
+ * @param nodesToAdd nodes to be added
+ */
+ public void addNodes(List<FiCaSchedulerNode> nodesToAdd) {
+ writeLock.lock();
+ try {
+ for (FiCaSchedulerNode node : nodesToAdd) {
+ nodeTracker.addNode(node);
+ }
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ private void addNode(RMNode nodeManager) {
+ writeLock.lock();
+ try {
+ FiCaSchedulerNode schedulerNode = new FiCaSchedulerNode(nodeManager,
+ usePortForNodeName, nodeManager.getNodeLabels());
+ nodeTracker.addNode(schedulerNode);
+
+ // update this node to node label manager
+ if (labelManager != null) {
+ labelManager.activateNode(nodeManager.getNodeID(),
+ schedulerNode.getTotalResource());
+ }
+
+ // recover attributes from store if any.
+ if (rmContext.getNodeAttributesManager() != null) {
+ rmContext.getNodeAttributesManager()
+ .refreshNodeAttributesToScheduler(schedulerNode.getNodeID());
+ }
+
+ Resource clusterResource = getClusterResource();
+ getRootQueue().updateClusterResource(clusterResource,
+ new ResourceLimits(clusterResource));
+
+ LOG.info(
+ "Added node " + nodeManager.getNodeAddress() + " clusterResource: "
+ + clusterResource);
+
+ if (scheduleAsynchronously && getNumClusterNodes() == 1) {
+ for (AsyncScheduleThread t : asyncSchedulerThreads) {
+ t.beginSchedule();
+ }
+ }
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ private void removeNode(RMNode nodeInfo) {
+ writeLock.lock();
+ try {
+ // update this node to node label manager
+ if (labelManager != null) {
+ labelManager.deactivateNode(nodeInfo.getNodeID());
+ }
+
+ NodeId nodeId = nodeInfo.getNodeID();
+ FiCaSchedulerNode node = nodeTracker.getNode(nodeId);
+ if (node == null) {
+ LOG.error("Attempting to remove non-existent node " + nodeId);
+ return;
+ }
+
+ // Remove running containers
+ List<RMContainer> runningContainers =
+ node.getCopiedListOfRunningContainers();
+ for (RMContainer container : runningContainers) {
+ super.completedContainer(container, SchedulerUtils
+ .createAbnormalContainerStatus(container.getContainerId(),
+ SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL);
+ node.releaseContainer(container.getContainerId(), true);
+ }
+
+ // Remove reservations, if any
+ RMContainer reservedContainer = node.getReservedContainer();
+ if (reservedContainer != null) {
+ super.completedContainer(reservedContainer, SchedulerUtils
+ .createAbnormalContainerStatus(reservedContainer.getContainerId(),
+ SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL);
+ }
+
+ nodeTracker.removeNode(nodeId);
+ Resource clusterResource = getClusterResource();
+ getRootQueue().updateClusterResource(clusterResource,
+ new ResourceLimits(clusterResource));
+ int numNodes = nodeTracker.nodeCount();
+
+ if (scheduleAsynchronously && numNodes == 0) {
+ for (AsyncScheduleThread t : asyncSchedulerThreads) {
+ t.suspendSchedule();
+ }
+ }
+
+ LOG.info(
+ "Removed node " + nodeInfo.getNodeAddress() + " clusterResource: "
+ + getClusterResource());
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ @Override
+ protected void completedContainerInternal(
+ RMContainer rmContainer, ContainerStatus containerStatus,
+ RMContainerEventType event) {
+ Container container = rmContainer.getContainer();
+ ContainerId containerId = container.getId();
+
+ // Get the application for the finished container
+ FiCaSchedulerApp application = getCurrentAttemptForContainer(
+ container.getId());
+ ApplicationId appId =
+ containerId.getApplicationAttemptId().getApplicationId();
+ if (application == null) {
+ LOG.info(
+ "Container " + container + " of" + " finished application " + appId
+ + " completed with event " + event);
+ return;
+ }
+
+ // Get the node on which the container was allocated
+ FiCaSchedulerNode node = getNode(container.getNodeId());
+ if (null == node) {
+ LOG.info("Container " + container + " of" + " removed node " + container
+ .getNodeId() + " completed with event " + event);
+ return;
+ }
+
+ // Inform the queue
+ LeafQueue queue = (LeafQueue) application.getQueue();
+ queue.completedContainer(getClusterResource(), application, node,
+ rmContainer, containerStatus, event, null, true);
+ }
+
+ @Lock(Lock.NoLock.class)
+ @VisibleForTesting
+ @Override
+ public FiCaSchedulerApp getApplicationAttempt(
+ ApplicationAttemptId applicationAttemptId) {
+ return super.getApplicationAttempt(applicationAttemptId);
+ }
+
+ @Lock(Lock.NoLock.class)
+ public FiCaSchedulerNode getNode(NodeId nodeId) {
+ return nodeTracker.getNode(nodeId);
+ }
+
+ @Lock(Lock.NoLock.class)
+ public List getAllNodes() {
+ return nodeTracker.getAllNodes();
+ }
+
+ @Override
+ @Lock(Lock.NoLock.class)
+ public void recover(RMState state) throws Exception {
+ // NOT IMPLEMENTED
+ }
+
+ @Override
+ public void killReservedContainer(RMContainer container) {
+ LOG.debug("{}:{}", SchedulerEventType.KILL_RESERVED_CONTAINER, container);
+
+ // To think about: what happens if this is no longer a reserved container,
+ // e.g. if the reservation became an allocation?
+ super.completedContainer(container,
+ SchedulerUtils.createAbnormalContainerStatus(
+ container.getContainerId(),
+ SchedulerUtils.UNRESERVED_CONTAINER),
+ RMContainerEventType.KILL);
+ }
+
+ @Override
+ public void markContainerForPreemption(ApplicationAttemptId aid,
+ RMContainer cont) {
+ LOG.debug("{}: appAttempt:{} container:{}",
+ SchedulerEventType.MARK_CONTAINER_FOR_PREEMPTION, aid, cont);
+ FiCaSchedulerApp app = getApplicationAttempt(aid);
+ if (app != null) {
+ app.markContainerForPreemption(cont.getContainerId());
+ }
+ }
+
+ @VisibleForTesting
+ @Override
+ public void killContainer(RMContainer container) {
+ markContainerForKillable(container);
+ }
+
+ public void markContainerForKillable(
+ RMContainer killableContainer) {
+ writeLock.lock();
+ try {
+ LOG.debug("{}: container {}",
+ SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE, killableContainer);
+
+ if (!isLazyPreemptionEnabled) {
+ super.completedContainer(killableContainer, SchedulerUtils
+ .createPreemptedContainerStatus(killableContainer.getContainerId(),
+ SchedulerUtils.PREEMPTED_CONTAINER), RMContainerEventType.KILL);
+ } else {
+ FiCaSchedulerNode node = getSchedulerNode(
+ killableContainer.getAllocatedNode());
+
+ FiCaSchedulerApp application = getCurrentAttemptForContainer(
+ killableContainer.getContainerId());
+
+ node.markContainerToKillable(killableContainer.getContainerId());
+
+ // notify PreemptionManager
+ // Get the application for the finished container
+ if (null != application) {
+ String leafQueuePath = application.getCSLeafQueue().getQueuePath();
+ getPreemptionManager().addKillableContainer(
+ new KillableContainer(killableContainer, node.getPartition(),
+ leafQueuePath));
+ }
+ }
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ private void markContainerForNonKillable(
+ RMContainer nonKillableContainer) {
+ writeLock.lock();
+ try {
+ LOG.debug("{}: container {}", SchedulerEventType.
+ MARK_CONTAINER_FOR_NONKILLABLE, nonKillableContainer);
+
+ FiCaSchedulerNode node = getSchedulerNode(
+ nonKillableContainer.getAllocatedNode());
+
+ FiCaSchedulerApp application = getCurrentAttemptForContainer(
+ nonKillableContainer.getContainerId());
+
+ node.markContainerToNonKillable(nonKillableContainer.getContainerId());
+
+ // notify PreemptionManager
+ // Get the application for the finished container
+ if (null != application) {
+ String leafQueuePath = application.getCSLeafQueue().getQueuePath();
+ getPreemptionManager().removeKillableContainer(
+ new KillableContainer(nonKillableContainer, node.getPartition(),
+ leafQueuePath));
+ }
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ @Override
+ public boolean checkAccess(UserGroupInformation callerUGI,
+ QueueACL acl, String queueName) {
+ CSQueue queue = getQueue(queueName);
+
+ if (queueName.startsWith("root.")) {
+ // can only check proper ACLs if the path is fully qualified
+ while (queue == null) {
+ int sepIndex = queueName.lastIndexOf(".");
+ String parentName = queueName.substring(0, sepIndex);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Queue {} does not exist, checking parent {}",
+ queueName, parentName);
+ }
+ queueName = parentName;
+ queue = queueManager.getQueue(queueName);
+ }
+ }
+
+ if (queue == null) {
+ LOG.debug("ACL not found for queue access-type {} for queue {}",
+ acl, queueName);
+ return false;
+ }
+ return queue.hasAccess(acl, callerUGI);
+ }
+
+ @Override
+ public List<ApplicationAttemptId> getAppsInQueue(String queueName) {
+ CSQueue queue = getQueue(queueName);
+ if (queue == null) {
+ return null;
+ }
+ List<ApplicationAttemptId> apps = new ArrayList<>();
+ queue.collectSchedulerApplications(apps);
+ return apps;
+ }
+
+ public boolean isSystemAppsLimitReached() {
+ return getRootQueue().getNumApplications() >= conf
+ .getMaximumSystemApplications();
+ }
+
+ private String getDefaultReservationQueueName(String planQueueName) {
+ return planQueueName + ReservationConstants.DEFAULT_QUEUE_SUFFIX;
+ }
+
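+ /**
+ * Maps an app submitted to a plan queue onto a runnable queue: the
+ * reservation's dedicated queue when a valid ReservationId is given,
+ * otherwise the plan's default child queue. Non-plan queues are returned
+ * unchanged; returns null (after rejecting the app) when the reservation
+ * is invalid.
+ */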
+ private String resolveReservationQueueName(String queueName,
+ ApplicationId applicationId, ReservationId reservationID,
+ boolean isRecovering) {
+ readLock.lock();
+ try {
+ CSQueue queue = getQueue(queueName);
+ // Check if the queue is a plan queue
+ if ((queue == null) || !(queue instanceof PlanQueue)) {
+ return queueName;
+ }
+ if (reservationID != null) {
+ String resQName = reservationID.toString();
+ queue = getQueue(resQName);
+ if (queue == null) {
+ // reservation has terminated during failover
+ if (isRecovering && conf.getMoveOnExpiry(
+ getQueue(queueName).getQueuePath())) {
+ // move to the default child queue of the plan
+ return getDefaultReservationQueueName(queueName);
+ }
+ String message = "Application " + applicationId
+ + " submitted to a reservation which is not currently active: "
+ + resQName;
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
+ message));
+ return null;
+ }
+ if (!queue.getParent().getQueuePath().equals(queueName)) {
+ String message =
+ "Application: " + applicationId + " submitted to a reservation "
+ + resQName + " which does not belong to the specified queue: "
+ + queueName;
+ this.rmContext.getDispatcher().getEventHandler().handle(
+ new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED,
+ message));
+ return null;
+ }
+ // use the reservation queue to run the app
+ queueName = resQName;
+ } else {
+ // use the default child queue of the plan for unreserved apps
+ queueName = getDefaultReservationQueueName(queueName);
+ }
+ return queueName;
+ } finally {
+ readLock.unlock();
+ }
+ }
+
+ @Override
+ public void removeQueue(String queueName)
+ throws SchedulerDynamicEditException {
+ writeLock.lock();
+ try {
+ LOG.info("Removing queue: " + queueName);
+ CSQueue q = this.getQueue(queueName);
+ if (!(AbstractAutoCreatedLeafQueue.class.isAssignableFrom(
+ q.getClass()))) {
+ throw new SchedulerDynamicEditException(
+ "The queue that we are asked " + "to remove (" + queueName
+ + ") is not a AutoCreatedLeafQueue or ReservationQueue");
+ }
+ AbstractAutoCreatedLeafQueue disposableLeafQueue =
+ (AbstractAutoCreatedLeafQueue) q;
+ // at this point we should have no more apps
+ if (disposableLeafQueue.getNumApplications() > 0) {
+ throw new SchedulerDynamicEditException(
+ "The queue " + queueName + " is not empty " + disposableLeafQueue
+ .getApplications().size() + " active apps "
+ + disposableLeafQueue.getPendingApplications().size()
+ + " pending apps");
+ }
+
+ ((AbstractManagedParentQueue) disposableLeafQueue.getParent())
+ .removeChildQueue(q);
+ this.queueManager.removeQueue(queueName);
+ LOG.info(
+ "Removal of AutoCreatedLeafQueue " + queueName + " has succeeded");
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ @Override
+ public void addQueue(Queue queue)
+ throws SchedulerDynamicEditException, IOException {
+ writeLock.lock();
+ try {
+ if (queue == null) {
+ throw new SchedulerDynamicEditException(
+ "Queue specified is null. Should be an implementation of "
+ + "AbstractAutoCreatedLeafQueue");
+ } else if (!(AbstractAutoCreatedLeafQueue.class
+ .isAssignableFrom(queue.getClass()))) {
+ throw new SchedulerDynamicEditException(
+ "Queue is not an implementation of "
+ + "AbstractAutoCreatedLeafQueue : " + queue.getClass());
+ }
+
+ AbstractAutoCreatedLeafQueue newQueue =
+ (AbstractAutoCreatedLeafQueue) queue;
+
+ if (newQueue.getParent() == null || !(AbstractManagedParentQueue.class.
+ isAssignableFrom(newQueue.getParent().getClass()))) {
+ throw new SchedulerDynamicEditException(
+ "ParentQueue for " + newQueue + " is not properly set"
+ + " (should be set and be a PlanQueue or ManagedParentQueue)");
+ }
+
+ AbstractManagedParentQueue parent =
+ (AbstractManagedParentQueue) newQueue.getParent();
+ String queuePath = newQueue.getQueuePath();
+ parent.addChildQueue(newQueue);
+ this.queueManager.addQueue(queuePath, newQueue);
+
+ LOG.info("Creation of AutoCreatedLeafQueue " + newQueue + " succeeded");
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ @Override
+ public void setEntitlement(String inQueue, QueueEntitlement entitlement)
+ throws YarnException {
+ writeLock.lock();
+ try {
+ LeafQueue queue = this.queueManager.getAndCheckLeafQueue(inQueue);
+ AbstractManagedParentQueue parent =
+ (AbstractManagedParentQueue) queue.getParent();
+
+ if (!(AbstractAutoCreatedLeafQueue.class.isAssignableFrom(
+ queue.getClass()))) {
+ throw new SchedulerDynamicEditException(
+ "Entitlement can not be" + " modified dynamically since queue "
+ + inQueue + " is not a AutoCreatedLeafQueue");
+ }
+
+ if (parent == null || !(AbstractManagedParentQueue.class.isAssignableFrom(
+ parent.getClass()))) {
+ throw new SchedulerDynamicEditException(
+ "The parent of AutoCreatedLeafQueue " + inQueue
+ + " must be a PlanQueue/ManagedParentQueue");
+ }
+
+ AbstractAutoCreatedLeafQueue newQueue =
+ (AbstractAutoCreatedLeafQueue) queue;
+ parent.validateQueueEntitlementChange(newQueue, entitlement);
+
+ newQueue.setEntitlement(entitlement);
+
+ LOG.info("Set entitlement for AutoCreatedLeafQueue " + inQueue + " to "
+ + queue.getCapacity() + " request was (" + entitlement.getCapacity()
+ + ")");
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ @Override
+ public String moveApplication(ApplicationId appId,
+ String targetQueueName) throws YarnException {
+ writeLock.lock();
+ try {
+ SchedulerApplication<FiCaSchedulerApp> application =
+ applications.get(appId);
+ if (application == null) {
+ throw new YarnException("App to be moved " + appId + " not found.");
+ }
+ String sourceQueueName = application.getQueue().getQueueName();
+ LeafQueue source =
+ this.queueManager.getAndCheckLeafQueue(sourceQueueName);
+ String destQueueName = handleMoveToPlanQueue(targetQueueName);
+ LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName);
+
+ String user = application.getUser();
+ try {
+ dest.submitApplication(appId, user, destQueueName);
+ } catch (AccessControlException e) {
+ throw new YarnException(e);
+ }
+
+ FiCaSchedulerApp app = application.getCurrentAppAttempt();
+ if (app != null) {
+ // Move all live containers even when stopped; this is required for
+ // transferStateFromPreviousAttempt.
+ for (RMContainer rmContainer : app.getLiveContainers()) {
+ source.detachContainer(getClusterResource(), app, rmContainer);
+ // attach the Container to another queue
+ dest.attachContainer(getClusterResource(), app, rmContainer);
+ }
+ // Move all reserved containers
+ for (RMContainer rmContainer : app.getReservedContainers()) {
+ source.detachContainer(getClusterResource(), app, rmContainer);
+ dest.attachContainer(getClusterResource(), app, rmContainer);
+ }
+ if (!app.isStopped()) {
+ source.finishApplicationAttempt(app, sourceQueueName);
+ // Submit to a new queue
+ dest.submitApplicationAttempt(app, user, true);
+ }
+ // Finish app & update metrics
+ app.move(dest);
+ }
+ source.appFinished();
+ // Detach the application.
+ source.getParent().finishApplication(appId, user);
+ application.setQueue(dest);
+ LOG.info("App: " + appId + " successfully moved from " + sourceQueueName
+ + " to: " + destQueueName);
+ return targetQueueName;
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ @Override
+ public void preValidateMoveApplication(ApplicationId appId,
+ String newQueue) throws YarnException {
+ writeLock.lock();
+ try {
+ SchedulerApplication<FiCaSchedulerApp> application =
+ applications.get(appId);
+ if (application == null) {
+ throw new YarnException("App to be moved " + appId + " not found.");
+ }
+ Queue queue = application.getQueue();
+ String sourceQueueName = queue instanceof CSQueue ?
+ ((CSQueue) queue).getQueuePath() : queue.getQueueName();
+ this.queueManager.getAndCheckLeafQueue(sourceQueueName);
+ String destQueueName = handleMoveToPlanQueue(newQueue);
+ LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName);
+ // Validation check - ACLs, submission limits for user & queue
+ String user = application.getUser();
+ // Check active partition only when attempt is available
+ FiCaSchedulerApp appAttempt =
+ getApplicationAttempt(ApplicationAttemptId.newInstance(appId, 0));
+ if (null != appAttempt) {
+ checkQueuePartition(appAttempt, dest);
+ }
+ try {
+ dest.validateSubmitApplication(appId, user, destQueueName);
+ } catch (AccessControlException e) {
+ throw new YarnException(e);
+ }
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ /**
+ * Check whether the application can be moved to a queue when node labels
+ * are enabled. All labels requested over the application's lifetime are
+ * checked.
+ *
+ * @param app the application attempt to be moved
+ * @param dest the destination leaf queue
+ * @throws YarnException if the app requests labels the queue cannot access
+ */
+ private void checkQueuePartition(FiCaSchedulerApp app, LeafQueue dest)
+ throws YarnException {
+ if (!YarnConfiguration.areNodeLabelsEnabled(conf)) {
+ return;
+ }
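+ // Compare every partition the app has requested against the target
+ // queue's accessible labels, collecting the inaccessible ones.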
+ Set<String> targetqueuelabels = dest.getAccessibleNodeLabels();
+ AppSchedulingInfo schedulingInfo = app.getAppSchedulingInfo();
+ Set<String> appLabelexpressions = schedulingInfo.getRequestedPartitions();
+ // Default partition access is always available, so drop the empty label.
+ appLabelexpressions.remove(RMNodeLabelsManager.NO_LABEL);
+ Set<String> nonAccessiblelabels = new HashSet<>();
+ for (String label : appLabelexpressions) {
+ if (!SchedulerUtils.checkQueueLabelExpression(targetqueuelabels, label,
+ null)) {
+ nonAccessiblelabels.add(label);
+ }
+ }
+ if (!nonAccessiblelabels.isEmpty()) {
+ throw new YarnException(
+ "Specified queue=" + dest.getQueuePath() + " cannot satisfy the "
+ + "app's label expressions=" + nonAccessiblelabels
+ + ", accessible node labels=" + targetqueuelabels);
+ }
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public EnumSet<SchedulerResourceTypes> getSchedulingResourceTypes() {
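+ // DefaultResourceCalculator only accounts for memory; any other
+ // calculator (e.g. DominantResourceCalculator) also tracks CPU.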
+ if (calculator.getClass().getName()
+ .equals(DefaultResourceCalculator.class.getName())) {
+ return EnumSet.of(SchedulerResourceTypes.MEMORY);
+ }
+ return EnumSet.of(SchedulerResourceTypes.MEMORY, SchedulerResourceTypes.CPU);
+ }
+
+ @Override
+ public Resource getMaximumResourceCapability(String queueName) {
+ if (queueName == null || queueName.isEmpty()) {
+ return getMaximumResourceCapability();
+ }
+ CSQueue queue = getQueue(queueName);
+ if (queue == null) {
+ if (isAmbiguous(queueName)) {
+ LOG.error("Ambiguous queue reference: " + queueName
+ + " please use full queue path instead.");
+ } else {
+ LOG.error("Unknown queue: " + queueName);
+ }
+ return getMaximumResourceCapability();
+ }
+ if (!(queue instanceof LeafQueue)) {
+ LOG.error("queue " + queueName + " is not an leaf queue");
+ return getMaximumResourceCapability();
+ }
+
+ // queue.getMaximumAllocation() returns the *configured* maximum
+ // allocation. getMaximumResourceCapability() returns the maximum
+ // allocation taking per-node maximum resources into account. So return
+ // the (component-wise) min of the two.
+
+ Resource queueMaxAllocation = ((LeafQueue) queue).getMaximumAllocation();
+ Resource clusterMaxAllocationConsiderNodeMax =
+ getMaximumResourceCapability();
+
+ return Resources.componentwiseMin(queueMaxAllocation,
+ clusterMaxAllocationConsiderNodeMax);
+ }
+
+ private String handleMoveToPlanQueue(String targetQueueName) {
+ CSQueue dest = getQueue(targetQueueName);
+ if (dest instanceof PlanQueue) {
+ // use the default child reservation queue of the plan
+ targetQueueName = targetQueueName + ReservationConstants.DEFAULT_QUEUE_SUFFIX;
+ }
+ return targetQueueName;
+ }
+
+ @Override
+ public Set<String> getPlanQueues() {
+ Set<String> ret = new HashSet<>();
+ for (Entry<String, CSQueue> l : queueManager.getQueues().entrySet()) {
+ if (l.getValue() instanceof PlanQueue) {
+ ret.add(l.getKey());
+ }
+ }
+ return ret;
+ }
+
+ @Override
+ public Priority checkAndGetApplicationPriority(
+ Priority priorityRequestedByApp, UserGroupInformation user,
+ String queuePath, ApplicationId applicationId) throws YarnException {
+ readLock.lock();
+ try {
+ Priority appPriority = priorityRequestedByApp;
+
+ // Verify the scenario where priority is null from submissionContext.
+ if (null == appPriority) {
+ // Verify whether submitted user has any default priority set. If so,
+ // user's default priority will get precedence over queue default.
+ // For the updateApplicationPriority call flow, this check is done in
+ // ClientRMService itself.
+ appPriority = this.appPriorityACLManager.getDefaultPriority(
+ normalizeQueueName(queuePath),
+ user);
+
+ // Otherwise fall back to the queue's default priority, or the cluster
+ // default if the queue does not exist. Do it only if the user doesn't
+ // have any default.
+ if (null == appPriority) {
+ appPriority = this.queueManager.getDefaultPriorityForQueue(
+ normalizeQueueName(queuePath));
+ }
+
+ LOG.info(
+ "Application '" + applicationId + "' is submitted without priority "
+ + "hence considering default queue/cluster priority: "
+ + appPriority.getPriority());
+ }
+
+ // Verify the submitted priority does not exceed the maximum priority
+ // in the cluster; if it does, cap it at the maximum.
+ if (appPriority.getPriority() > getMaxClusterLevelAppPriority()
+ .getPriority()) {
+ appPriority = Priority
+ .newInstance(getMaxClusterLevelAppPriority().getPriority());
+ }
+
+ // Let's check ACLs here.
+ if (!appPriorityACLManager.checkAccess(user, normalizeQueueName(queuePath), appPriority)) {
+ throw new YarnException(new AccessControlException(
+ "User " + user + " does not have permission to submit/update "
+ + applicationId + " for " + appPriority));
+ }
+
+ LOG.info("Priority '" + appPriority.getPriority()
+ + "' is acceptable in queue : " + queuePath + " for application: "
+ + applicationId);
+
+ return appPriority;
+ } finally {
+ readLock.unlock();
+ }
+ }
+
+ @Override
+ public Priority updateApplicationPriority(Priority newPriority,
+ ApplicationId applicationId, SettableFuture<Object> future,
+ UserGroupInformation user)
+ throws YarnException {
+ writeLock.lock();
+ try {
+ Priority appPriority = null;
+ SchedulerApplication<FiCaSchedulerApp> application =
+ applications.get(applicationId);
+
+ if (application == null) {
+ throw new YarnException("Application '" + applicationId
+ + "' is not present, hence could not change priority.");
+ }
+
+ RMApp rmApp = rmContext.getRMApps().get(applicationId);
+
+ appPriority = checkAndGetApplicationPriority(newPriority, user,
+ rmApp.getQueue(), applicationId);
+
+ if (application.getPriority().equals(appPriority)) {
+ future.set(null);
+ return appPriority;
+ }
+
+ // Update new priority in Submission Context to update to StateStore.
+ rmApp.getApplicationSubmissionContext().setPriority(appPriority);
+
+ // Update to state store
+ ApplicationStateData appState = ApplicationStateData.newInstance(
+ rmApp.getSubmitTime(), rmApp.getStartTime(),
+ rmApp.getApplicationSubmissionContext(), rmApp.getUser(),
+ rmApp.getRealUser(), rmApp.getCallerContext());
+ appState.setApplicationTimeouts(rmApp.getApplicationTimeouts());
+ appState.setLaunchTime(rmApp.getLaunchTime());
+ rmContext.getStateStore().updateApplicationStateSynchronously(appState,
+ false, future);
+
+ // The OrderingPolicy iterates over a TreeSet, so once the priority
+ // changes the app must be reinserted to keep the order correct.
+ LeafQueue queue = (LeafQueue) getQueue(rmApp.getQueue());
+ queue.updateApplicationPriority(application, appPriority);
+
+ LOG.info("Priority '" + appPriority + "' is updated in queue :"
+ + rmApp.getQueue() + " for application: " + applicationId
+ + " for the user: " + rmApp.getUser());
+ return appPriority;
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ @Override
+ public PreemptionManager getPreemptionManager() {
+ return preemptionManager;
+ }
+
+ @Override
+ public ResourceUsage getClusterResourceUsage() {
+ return getRootQueue().getQueueResourceUsage();
+ }
+
+ private SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
+ getSchedulerContainer(RMContainer rmContainer, boolean allocated) {
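+ // Resolve the application attempt and node backing this container;
+ // return null if either has gone away (e.g. app finished, node removed).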
+ if (null == rmContainer) {
+ return null;
+ }
+
+ FiCaSchedulerApp app = getApplicationAttempt(
+ rmContainer.getApplicationAttemptId());
+ if (null == app) {
+ return null;
+ }
+
+ NodeId nodeId;
+ // Get nodeId
+ if (rmContainer.getState() == RMContainerState.RESERVED) {
+ nodeId = rmContainer.getReservedNode();
+ } else {
+ nodeId = rmContainer.getNodeId();
+ }
+
+ FiCaSchedulerNode node = getNode(nodeId);
+ if (null == node) {
+ return null;
+ }
+ return new SchedulerContainer<>(app, node, rmContainer,
+ // TODO, node partition should come from CSAssignment to avoid partition
+ // get updated before submitting the commit
+ node.getPartition(), allocated);
+ }
+
+ private List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>>
+ getSchedulerContainersToRelease(CSAssignment csAssignment) {
+ List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>> list = null;
+
+ if (csAssignment.getContainersToKill() != null && !csAssignment
+ .getContainersToKill().isEmpty()) {
+ list = new ArrayList<>();
+ for (RMContainer rmContainer : csAssignment.getContainersToKill()) {
+ SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode> schedulerContainer =
+ getSchedulerContainer(rmContainer, false);
+ if (schedulerContainer != null) {
+ list.add(schedulerContainer);
+ }
+ }
+ }
+
+ if (csAssignment.getExcessReservation() != null) {
+ if (null == list) {
+ list = new ArrayList<>();
+ }
+ SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode> schedulerContainer =
+ getSchedulerContainer(csAssignment.getExcessReservation(), false);
+ if (schedulerContainer != null) {
+ list.add(schedulerContainer);
+ }
+ }
+
+ if (list != null && list.isEmpty()) {
+ list = null;
+ }
+ return list;
+ }
+
+ @VisibleForTesting
+ public void submitResourceCommitRequest(Resource cluster,
+ CSAssignment csAssignment) {
+ ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request =
+ createResourceCommitRequest(csAssignment);
+
+ if (null == request) {
+ return;
+ }
+
+ if (scheduleAsynchronously) {
+ // Submit to a commit thread and commit it async-ly
+ resourceCommitterService.addNewCommitRequest(request);
+ } else {
+ // Otherwise do it sync-ly.
+ tryCommit(cluster, request, true);
+ }
+ }
+
+ @Override
+ public boolean attemptAllocationOnNode(SchedulerApplicationAttempt appAttempt,
+ SchedulingRequest schedulingRequest, SchedulerNode schedulerNode) {
+ if (schedulingRequest.getResourceSizing() != null) {
+ if (schedulingRequest.getResourceSizing().getNumAllocations() > 1) {
+ LOG.warn("The SchedulingRequest has requested more than 1 allocation," +
+ " but only 1 will be attempted !!");
+ }
+ if (!appAttempt.isStopped()) {
+ ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>
+ resourceCommitRequest = createResourceCommitRequest(
+ appAttempt, schedulingRequest, schedulerNode);
+
+ // Validate placement constraint is satisfied before
+ // committing the request.
+ try {
+ if (!PlacementConstraintsUtil.canSatisfyConstraints(
+ appAttempt.getApplicationId(),
+ schedulingRequest, schedulerNode,
+ rmContext.getPlacementConstraintManager(),
+ rmContext.getAllocationTagsManager())) {
+ LOG.info("Failed to allocate container for application "
+ + appAttempt.getApplicationId() + " on node "
+ + schedulerNode.getNodeName()
+ + " because this allocation violates the"
+ + " placement constraint.");
+ return false;
+ }
+ } catch (InvalidAllocationTagsQueryException e) {
+ LOG.warn("Unable to allocate container", e);
+ return false;
+ }
+ return tryCommit(getClusterResource(), resourceCommitRequest, false);
+ }
+ }
+ return false;
+ }
+
+ // This assumes numContainers = 1 for the request.
+ private ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>
+ createResourceCommitRequest(SchedulerApplicationAttempt appAttempt,
+ SchedulingRequest schedulingRequest, SchedulerNode schedulerNode) {
+ ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> allocated =
+ null;
+ Resource resource = schedulingRequest.getResourceSizing().getResources();
+ if (Resources.greaterThan(calculator, getClusterResource(),
+ resource, Resources.none())) {
+ ContainerId cId =
+ ContainerId.newContainerId(appAttempt.getApplicationAttemptId(),
+ appAttempt.getAppSchedulingInfo().getNewContainerId());
+ Container container = BuilderUtils.newContainer(
+ cId, schedulerNode.getNodeID(), schedulerNode.getHttpAddress(),
+ resource, schedulingRequest.getPriority(), null,
+ ExecutionType.GUARANTEED,
+ schedulingRequest.getAllocationRequestId());
+ RMContainer rmContainer = new RMContainerImpl(container,
+ SchedulerRequestKey.extractFrom(container),
+ appAttempt.getApplicationAttemptId(), container.getNodeId(),
+ appAttempt.getUser(), rmContext, false);
+ ((RMContainerImpl) rmContainer).setAllocationTags(
+ new HashSet<>(schedulingRequest.getAllocationTags()));
+
+ SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
+ schedulerContainer = getSchedulerContainer(rmContainer, true);
+ if (schedulerContainer == null) {
+ allocated = null;
+ } else {
+ allocated = new ContainerAllocationProposal<>(schedulerContainer,
+ null, null, NodeType.NODE_LOCAL, NodeType.NODE_LOCAL,
+ SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, resource);
+ }
+ }
+
+ if (null != allocated) {
+ List<ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode>>
+ allocationsList = new ArrayList<>();
+ allocationsList.add(allocated);
+
+ return new ResourceCommitRequest<>(allocationsList, null, null);
+ }
+ return null;
+ }
+
+ @VisibleForTesting
+ public ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>
+ createResourceCommitRequest(CSAssignment csAssignment) {
+ ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> allocated =
+ null;
+ ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> reserved =
+ null;
+ List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>> released =
+ null;
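+ // A single CSAssignment carries at most one new allocation and one new
+ // reservation, so only the first entry of each details list is consumed.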
+
+ if (Resources.greaterThan(calculator, getClusterResource(),
+ csAssignment.getResource(), Resources.none())) {
+ // Allocated something
+ List<AssignmentInformation.AssignmentDetails> allocations =
+ csAssignment.getAssignmentInformation().getAllocationDetails();
+ if (!allocations.isEmpty()) {
+ RMContainer rmContainer = allocations.get(0).rmContainer;
+ SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
+ schedulerContainer = getSchedulerContainer(rmContainer, true);
+ if (schedulerContainer == null) {
+ allocated = null;
+ // Decrease unconfirmed resource if app is alive
+ FiCaSchedulerApp app = getApplicationAttempt(
+ rmContainer.getApplicationAttemptId());
+ if (app != null) {
+ app.decUnconfirmedRes(rmContainer.getAllocatedResource());
+ }
+ } else {
+ allocated = new ContainerAllocationProposal<>(schedulerContainer,
+ getSchedulerContainersToRelease(csAssignment),
+ getSchedulerContainer(
+ csAssignment.getFulfilledReservedContainer(), false),
+ csAssignment.getType(), csAssignment.getRequestLocalityType(),
+ csAssignment.getSchedulingMode() != null ?
+ csAssignment.getSchedulingMode() :
+ SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY,
+ csAssignment.getResource());
+ }
+ }
+
+ // Reserved something
+ List<AssignmentInformation.AssignmentDetails> reservation =
+ csAssignment.getAssignmentInformation().getReservationDetails();
+ if (!reservation.isEmpty()) {
+ RMContainer rmContainer = reservation.get(0).rmContainer;
+ SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
+ schedulerContainer = getSchedulerContainer(rmContainer, false);
+ if (schedulerContainer == null) {
+ reserved = null;
+ } else {
+ reserved = new ContainerAllocationProposal<>(schedulerContainer,
+ getSchedulerContainersToRelease(csAssignment),
+ getSchedulerContainer(
+ csAssignment.getFulfilledReservedContainer(), false),
+ csAssignment.getType(), csAssignment.getRequestLocalityType(),
+ csAssignment.getSchedulingMode() != null ?
+ csAssignment.getSchedulingMode() :
+ SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY,
+ csAssignment.getResource());
+ }
+ }
+ }
+
+ // When we don't need to allocate or reserve anything, we are free to
+ // kill all to-release containers in the request.
+ if (null == allocated && null == reserved) {
+ released = getSchedulerContainersToRelease(csAssignment);
+ }
+
+ if (null != allocated || null != reserved || (null != released && !released
+ .isEmpty())) {
+ List<ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode>>
+ allocationsList = null;
+ if (allocated != null) {
+ allocationsList = new ArrayList<>();
+ allocationsList.add(allocated);
+ }
+
+ List<ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode>>
+ reservationsList = null;
+ if (reserved != null) {
+ reservationsList = new ArrayList<>();
+ reservationsList.add(reserved);
+ }
+
+ return new ResourceCommitRequest<>(allocationsList, reservationsList,
+ released);
+ }
+
+ return null;
+ }
+
+ @Override
+ public boolean tryCommit(Resource cluster, ResourceCommitRequest r,
+ boolean updatePending) {
+ long commitStart = System.nanoTime();
+ ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request =
+ (ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>) r;
+
+ ApplicationAttemptId attemptId = null;
+
+ // We need to update unconfirmed allocated resource of application when
+ // any container allocated.
+ boolean updateUnconfirmedAllocatedResource =
+ request.getContainersToAllocate() != null && !request
+ .getContainersToAllocate().isEmpty();
+
+ // find the application to accept and apply the ResourceCommitRequest
+ if (request.anythingAllocatedOrReserved()) {
+ ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> c =
+ request.getFirstAllocatedOrReservedContainer();
+ attemptId =
+ c.getAllocatedOrReservedContainer().getSchedulerApplicationAttempt()
+ .getApplicationAttemptId();
+ } else {
+ if (!request.getContainersToRelease().isEmpty()) {
+ attemptId = request.getContainersToRelease().get(0)
+ .getSchedulerApplicationAttempt().getApplicationAttemptId();
+ }
+ }
+
+ LOG.debug("Try to commit allocation proposal={}", request);
+
+ boolean isSuccess = false;
+ if (attemptId != null) {
+ FiCaSchedulerApp app = getApplicationAttempt(attemptId);
+ // Required sanity check for attemptId: with async scheduling enabled,
+ // the proposal might be outdated if an AM failover just finished and the
+ // proposal queue was not consumed in time.
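+ // Two-phase commit: accept() validates the proposal against the current
+ // scheduler state, and apply() mutates queue/application bookkeeping only
+ // if accept() succeeds (note the short-circuit &&).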
+ if (app != null && attemptId.equals(app.getApplicationAttemptId())) {
+ if (app.accept(cluster, request, updatePending)
+ && app.apply(cluster, request, updatePending)) {
+ long commitSuccess = System.nanoTime() - commitStart;
+ CapacitySchedulerMetrics.getMetrics()
+ .addCommitSuccess(commitSuccess);
+ LOG.info("Allocation proposal accepted");
+ isSuccess = true;
+ } else {
+ long commitFailed = System.nanoTime() - commitStart;
+ CapacitySchedulerMetrics.getMetrics()
+ .addCommitFailure(commitFailed);
+ LOG.info("Failed to accept allocation proposal");
+ }
+
+ LOG.debug("Allocation proposal accepted={}, proposal={}", isSuccess,
+ request);
+
+ // Update unconfirmed allocated resource.
+ if (updateUnconfirmedAllocatedResource) {
+ app.decUnconfirmedRes(request.getTotalAllocatedResource());
+ }
+ }
+ }
+ return isSuccess;
+ }
+
+ public int getAsyncSchedulingPendingBacklogs() {
+ if (scheduleAsynchronously) {
+ return resourceCommitterService.getPendingBacklogs();
+ }
+ return 0;
+ }
+
+ @Override
+ public CapacitySchedulerQueueManager getCapacitySchedulerQueueManager() {
+ return this.queueManager;
+ }
+
+ public WorkflowPriorityMappingsManager getWorkflowPriorityMappingsManager() {
+ return this.workflowPriorityMappingsMgr;
+ }
+
+ /**
+ * Try to move a reserved container to a target node.
+ * The move is cancelled if the target node has been updated or removed,
+ * or if it already holds a reservation.
+ *
+ * @param toBeMovedContainer the reserved container to move
+ * @param targetNode the node to move the reservation to
+ * @return true if the move succeeded; false if the target node already
+ * holds a reservation or the move failed for any other reason.
+ */
+ public boolean moveReservedContainer(RMContainer toBeMovedContainer,
+ FiCaSchedulerNode targetNode) {
+ writeLock.lock();
+ try {
+ LOG.debug("Trying to move container={} to node={}",
+ toBeMovedContainer, targetNode.getNodeID());
+
+ FiCaSchedulerNode sourceNode = getNode(toBeMovedContainer.getNodeId());
+ if (null == sourceNode) {
+ LOG.debug("Failed to move reservation, cannot find source node={}",
+ toBeMovedContainer.getNodeId());
+ return false;
+ }
+
+ // Target node updated?
+ if (getNode(targetNode.getNodeID()) != targetNode) {
+ LOG.debug("Failed to move reservation, node updated or removed,"
+ + " moving cancelled.");
+ return false;
+ }
+
+ // Target node's reservation status changed?
+ if (targetNode.getReservedContainer() != null) {
+ LOG.debug("Target node's reservation status changed,"
+ + " moving cancelled.");
+ return false;
+ }
+
+ FiCaSchedulerApp app = getApplicationAttempt(
+ toBeMovedContainer.getApplicationAttemptId());
+ if (null == app) {
+ LOG.debug("Cannot find to-be-moved container's application={}",
+ toBeMovedContainer.getApplicationAttemptId());
+ return false;
+ }
+
+ // finally, move the reserved container
+ return app.moveReservation(toBeMovedContainer, sourceNode, targetNode);
+ } finally {
+ writeLock.unlock();
+ }
+ }
+
+ @Override
+ public long checkAndGetApplicationLifetime(String queueName,
+ long lifetimeRequestedByApp) {
+ readLock.lock();
+ try {
+ CSQueue queue = getQueue(queueName);
+ if (!(queue instanceof LeafQueue)) {
+ return lifetimeRequestedByApp;
+ }
+
+ long defaultApplicationLifetime =
+ ((LeafQueue) queue).getDefaultApplicationLifetime();
+ long maximumApplicationLifetime =
+ ((LeafQueue) queue).getMaximumApplicationLifetime();
+
+ // check only for maximum, that's enough because default can't
+ // exceed maximum
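+ // A non-positive maximum lifetime means unlimited: honor the app's own
+ // request, or fall back to the queue default when the app did not set one.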
+ if (maximumApplicationLifetime <= 0) {
+ return (lifetimeRequestedByApp <= 0) ? defaultApplicationLifetime :
+ lifetimeRequestedByApp;
+ }
+
+ if (lifetimeRequestedByApp <= 0) {
+ return defaultApplicationLifetime;
+ } else if (lifetimeRequestedByApp > maximumApplicationLifetime) {
+ return maximumApplicationLifetime;
+ }
+ return lifetimeRequestedByApp;
+ } finally {
+ readLock.unlock();
+ }
+ }
+
+ @Override
+ public long getMaximumApplicationLifetime(String queueName) {
+ CSQueue queue = getQueue(queueName);
+ if (!(queue instanceof LeafQueue)) {
+ if (isAmbiguous(queueName)) {
+ LOG.error("Ambiguous queue reference: " + queueName
+ + " please use full queue path instead.");
+ } else {
+ LOG.error("Unknown queue: " + queueName);
+ }
+ return -1;
+ }
+ // In seconds
+ return ((LeafQueue) queue).getMaximumApplicationLifetime();
+ }
+
+ @Override
+ public boolean isConfigurationMutable() {
+ return csConfProvider instanceof MutableConfigurationProvider;
+ }
+
+ @Override
+ public MutableConfigurationProvider getMutableConfProvider() {
+ if (isConfigurationMutable()) {
+ return (MutableConfigurationProvider) csConfProvider;
+ }
+ return null;
+ }
+
+ private LeafQueue autoCreateLeafQueue(
+ ApplicationPlacementContext placementContext)
+ throws IOException, YarnException {
+
+ AutoCreatedLeafQueue autoCreatedLeafQueue = null;
+
+ String leafQueueName = placementContext.getQueue();
+ String parentQueueName = placementContext.getParentQueue();
+
+ if (!StringUtils.isEmpty(parentQueueName)) {
+ CSQueue parentQueue = getQueue(parentQueueName);
+
+ if (parentQueue != null && conf.isAutoCreateChildQueueEnabled(
+ parentQueue.getQueuePath())) {
+
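+ // A parent with auto-create enabled is expected to be a
+ // ManagedParentQueue; a misconfigured parent type would fail this cast.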
+ ManagedParentQueue autoCreateEnabledParentQueue =
+ (ManagedParentQueue) parentQueue;
+ autoCreatedLeafQueue = new AutoCreatedLeafQueue(this, leafQueueName,
+ autoCreateEnabledParentQueue);
+
+ addQueue(autoCreatedLeafQueue);
+
+ } else {
+ throw new SchedulerDynamicEditException(
+ "Could not auto-create leaf queue for " + leafQueueName
+ + ". Queue mapping specifies a parent queue that does not exist: "
+ + parentQueueName);
+ }
+ } else {
+ throw new SchedulerDynamicEditException(
+ "Could not auto-create leaf queue for " + leafQueueName
+ + ". Queue mapping does not specify"
+ + " which parent queue it needs to be created under.");
+ }
+ return autoCreatedLeafQueue;
+ }
+
+ @Override
+ public void resetSchedulerMetrics() {
+ CapacitySchedulerMetrics.destroy();
+ }
+
+ public boolean isMultiNodePlacementEnabled() {
+ return multiNodePlacementEnabled;
+ }
+
+ public int getNumAsyncSchedulerThreads() {
+ return asyncSchedulerThreads == null ? 0 : asyncSchedulerThreads.size();
+ }
+
+ @VisibleForTesting
+ public void setMaxRunningAppsEnforcer(CSMaxRunningAppsEnforcer enforcer) {
+ this.maxRunningEnforcer = enforcer;
+ }
+
+ /**
+ * Returns true, as the capacity scheduler supports placement constraints.
+ */
+ @Override
+ public boolean placementConstraintEnabled() {
+ return true;
+ }
+
+ @VisibleForTesting
+ public void setQueueManager(CapacitySchedulerQueueManager qm) {
+ this.queueManager = qm;
+ }
}
diff --git a/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LoadBasedCapacityScheduler.java b/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LoadBasedCapacityScheduler.java
new file mode 100644
index 0000000000000000000000000000000000000000..a95702de692541e11668815af5e130d69906bf21
--- /dev/null
+++ b/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LoadBasedCapacityScheduler.java
@@ -0,0 +1,12 @@
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
+
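+/**
+ * Marker subclass that selects the load-based scheduling variant. As of this
+ * change it only affects the scheduler name reported by SchedulerInfo; the
+ * load-aware logic itself lives in the accompanying scheduler changes.
+ */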
+public class LoadBasedCapacityScheduler extends CapacityScheduler {
+ public LoadBasedCapacityScheduler() {
+ super(LoadBasedCapacityScheduler.class.getName());
+ }
+}
\ No newline at end of file
diff --git a/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/SchedulerInfo.java b/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/SchedulerInfo.java
index ede0d15105adc065df7c4bdc953cae5de7657aa9..f966b67e0908f4e8576652de88bf2725fa96a596 100644
--- a/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/SchedulerInfo.java
+++ b/load-based-schedule/yarn-schedule-load-evolution/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/SchedulerInfo.java
@@ -28,12 +28,13 @@ import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LoadBasedCapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler;
@XmlRootElement
@XmlSeeAlso({ CapacitySchedulerInfo.class, FairSchedulerInfo.class,
- FifoSchedulerInfo.class })
+ FifoSchedulerInfo.class })
public class SchedulerInfo {
protected String schedulerName;
protected ResourceInfo minAllocResource;
@@ -48,7 +49,9 @@ public class SchedulerInfo {
public SchedulerInfo(final ResourceManager rm) {
ResourceScheduler rs = rm.getResourceScheduler();
- if (rs instanceof CapacityScheduler) {
+ if (rs instanceof LoadBasedCapacityScheduler) {
+ this.schedulerName = "Load Based Capacity Scheduler";
+ } else if (rs instanceof CapacityScheduler) {
this.schedulerName = "Capacity Scheduler";
} else if (rs instanceof FairScheduler) {
this.schedulerName = "Fair Scheduler";
@@ -61,7 +64,7 @@ public class SchedulerInfo {
this.maxAllocResource = new ResourceInfo(rs.getMaximumResourceCapability());
this.schedulingResourceTypes = rs.getSchedulingResourceTypes();
this.maximumClusterPriority =
- rs.getMaxClusterLevelAppPriority().getPriority();
+ rs.getMaxClusterLevelAppPriority().getPriority();
}
public String getSchedulerType() {