Speed up `Jenkins._cleanUpDisconnectComputers` (#11102)

This commit is contained in:
Kris Stern 2025-09-30 09:08:50 +08:00 committed by GitHub
commit a68faf9554
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 27 additions and 33 deletions

View File

@ -94,10 +94,6 @@ public abstract class AbstractCIBase extends Node implements ItemGroup<TopLevelI
ViewJob.interruptReloadThread();
}
/**
 * Kills the given computer by delegating to its {@code kill()} method.
 *
 * @param c the computer to kill
 */
protected void killComputer(Computer c) {
c.kill();
}
private final Set<String> disabledAdministrativeMonitors = new HashSet<>();
/**
@ -267,12 +263,12 @@ public abstract class AbstractCIBase extends Node implements ItemGroup<TopLevelI
// we need to start the process of reducing the executors on all computers as distinct
// from the killing action which should not excessively use the Queue lock.
for (Computer c : old) {
c.inflictMortalWound();
c.setNumExecutors(0);
}
});
for (Computer c : old) {
// when we get to here, the number of executors should be zero so this call should not need the Queue.lock
killComputer(c);
c.kill();
}
getQueue().scheduleMaintenance();
Listeners.notify(ComputerListener.class, false, ComputerListener::onConfigurationChange);

View File

@ -827,23 +827,6 @@ public /*transient*/ abstract class Computer extends Actionable implements Acces
setNumExecutors(0);
}
/**
* Called by {@link Jenkins#updateComputerList(boolean, Collection)} to notify {@link Computer} that it will be discarded.
*
* <p>
* Note that at this point {@link #getNode()} returns null.
*
* <p>
* Note that the Queue lock is already held when this method is called.
*
* <p>
* The implementation does nothing beyond setting the number of executors to zero
* via {@code setNumExecutors(0)}.
*
* @see #onRemoved()
*/
@Restricted(NoExternalUse.class)
@GuardedBy("hudson.model.Queue.lock")
/*package*/ void inflictMortalWound() {
setNumExecutors(0);
}
/**
* Called by {@link Jenkins} when this computer is removed.
*
@ -865,7 +848,7 @@ public /*transient*/ abstract class Computer extends Actionable implements Acces
* Calling paths (a trailing * means the call is protected by Queue.withLock):
*
* Computer.doConfigSubmit -> Computer.replaceBy ->Jenkins.setNodes* ->Computer.setNode
* AbstractCIBase.updateComputerList->Computer.inflictMortalWound*
* AbstractCIBase.updateComputerList->Computer.setNumExecutors*
* AbstractCIBase.updateComputerList->AbstractCIBase.updateComputer* ->Computer.setNode
* AbstractCIBase.updateComputerList->AbstractCIBase.killComputer->Computer.kill
* Computer.constructor->Computer.setNode
@ -873,8 +856,9 @@ public /*transient*/ abstract class Computer extends Actionable implements Acces
*
* @param n number of executors
*/
@Restricted(NoExternalUse.class)
@GuardedBy("hudson.model.Queue.lock")
private void setNumExecutors(int n) {
public void setNumExecutors(int n) {
this.numExecutors = n;
final int diff = executors.size() - n;

View File

@ -916,12 +916,7 @@ public class SlaveComputer extends Computer {
protected void kill() {
super.kill();
closeChannel();
try {
log.close();
} catch (IOException x) {
LOGGER.log(Level.WARNING, "Failed to close agent log", x);
}
closeLog();
try {
Util.deleteRecursive(getLogDir());
} catch (IOException ex) {
@ -929,6 +924,15 @@ public class SlaveComputer extends Computer {
}
}
/**
 * Closes the {@code log} of this computer (the agent log).
 *
 * <p>
 * This is a best-effort operation: if closing fails with an {@link IOException}, the failure is
 * logged at {@link Level#WARNING} and not propagated to the caller.
 */
@Restricted(NoExternalUse.class)
public void closeLog() {
try {
log.close();
} catch (IOException x) {
// report the failure but do not rethrow, so callers can continue their own cleanup
LOGGER.log(Level.WARNING, "Failed to close agent log", x);
}
}
@Override
public RetentionStrategy getRetentionStrategy() {
Slave n = getNode();

View File

@ -176,6 +176,7 @@ import hudson.slaves.NodePropertyDescriptor;
import hudson.slaves.NodeProvisioner;
import hudson.slaves.OfflineCause;
import hudson.slaves.RetentionStrategy;
import hudson.slaves.SlaveComputer;
import hudson.tasks.BuildWrapper;
import hudson.tasks.Builder;
import hudson.tasks.Publisher;
@ -3775,7 +3776,10 @@ public class Jenkins extends AbstractCIBase implements DirectlyModifiableTopLeve
for (Computer c : getComputersCollection()) {
try {
c.interrupt();
killComputer(c);
c.setNumExecutors(0);
if (Main.isUnitTest && c instanceof SlaveComputer sc) {
sc.closeLog(); // help TemporaryDirectoryAllocator.dispose esp. on Windows
}
pending.add(c.disconnect(null));
} catch (OutOfMemoryError e) {
// we should just propagate this, no point trying to log
@ -3950,9 +3954,15 @@ public class Jenkins extends AbstractCIBase implements DirectlyModifiableTopLeve
if (!pending.isEmpty()) {
LOGGER.log(Main.isUnitTest ? Level.FINE : Level.INFO, "Waiting for node disconnection completion");
}
long end = System.nanoTime() + Duration.ofSeconds(10).toNanos();
for (Future<?> f : pending) {
try {
f.get(10, TimeUnit.SECONDS); // if clean up operation didn't complete in time, we fail the test
long remaining = end - System.nanoTime();
if (remaining <= 0) {
LOGGER.warning("Ran out of time waiting for agents to disconnect");
break;
}
f.get(remaining, TimeUnit.NANOSECONDS);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break; // someone wants us to die now. quick!