mirror of
				https://github.com/ohwgiles/laminar.git
				synced 2025-06-13 12:54:29 +00:00 
			
		
		
		
	more aggressive, recursive kill on abort
some jobs may spawn child processes which change their process group or session id, making the existing abort behavior of HUP-then-KILL to the process group ineffective. Instead, if HUP to the process group fails, recursively walk /proc and KILL anything with a ppid corresponding to the leader process. This should keep working because the leader process is a subreaper. resolves #129
This commit is contained in:
		
							parent
							
								
									7f7e8d2455
								
							
						
					
					
						commit
						8a4992e6af
					
				| @ -20,6 +20,7 @@ | ||||
| #include <string> | ||||
| #include <unistd.h> | ||||
| #include <queue> | ||||
| #include <dirent.h> | ||||
| #include <sys/prctl.h> | ||||
| #include <sys/types.h> | ||||
| #include <sys/wait.h> | ||||
| @ -45,6 +46,39 @@ struct Script { | ||||
|     bool runOnAbort; | ||||
| }; | ||||
| 
 | ||||
| static void aggressive_recursive_kill(pid_t parent) { | ||||
|     DIR* proc = opendir("/proc"); | ||||
|     if(!proc) | ||||
|         return; | ||||
| 
 | ||||
|     while(struct dirent* de = readdir(proc)) { | ||||
|         if(!isdigit(*de->d_name)) | ||||
|             continue; | ||||
| 
 | ||||
|         char status_file[640]; | ||||
|         sprintf(status_file, "/proc/%s/status", de->d_name); | ||||
| 
 | ||||
|         FILE* status_fp = fopen(status_file, "rb"); | ||||
|         if(!status_fp) | ||||
|             continue; | ||||
| 
 | ||||
|         char status_buffer[512]; | ||||
|         int n = fread(status_buffer, 1, 512, status_fp); | ||||
|         if(char* p = (char*)memmem(status_buffer, n, "PPid:\t", 6)) { | ||||
|             pid_t ppid = strtol(p + 6, NULL, 10); | ||||
|             if(ppid == parent) { | ||||
|                 pid_t pid = atoi(de->d_name); | ||||
|                 aggressive_recursive_kill(pid); | ||||
|                 fprintf(stderr, "[laminar] sending SIGKILL to pid %d\n", pid); | ||||
|                 kill(pid, SIGKILL); | ||||
|             } | ||||
|         } | ||||
|         fclose(status_fp); | ||||
|     } | ||||
|     closedir(proc); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| class Leader final : public kj::TaskSet::ErrorHandler { | ||||
| public: | ||||
|     Leader(kj::AsyncIoContext& ioContext, kj::Filesystem& fs, const char* jobName, uint runNumber); | ||||
| @ -67,6 +101,7 @@ private: | ||||
|     pid_t currentScriptPid; | ||||
|     std::queue<Script> scripts; | ||||
|     int setEnvPipe[2]; | ||||
|     bool aborting; | ||||
| }; | ||||
| 
 | ||||
| Leader::Leader(kj::AsyncIoContext &ioContext, kj::Filesystem &fs, const char *jobName, uint runNumber) : | ||||
| @ -76,7 +111,8 @@ Leader::Leader(kj::AsyncIoContext &ioContext, kj::Filesystem &fs, const char *jo | ||||
|     home(fs.getCurrent()), | ||||
|     rootPath(fs.getCurrentPath()), | ||||
|     jobName(jobName), | ||||
|     runNumber(runNumber) | ||||
|     runNumber(runNumber), | ||||
|     aborting(false) | ||||
| { | ||||
|     tasks.add(ioContext.unixEventPort.onSignal(SIGTERM).then([this](siginfo_t) { | ||||
|         while(scripts.size() && (!scripts.front().runOnAbort)) | ||||
| @ -84,8 +120,8 @@ Leader::Leader(kj::AsyncIoContext &ioContext, kj::Filesystem &fs, const char *jo | ||||
|         // TODO: probably shouldn't do this if we are already in a runOnAbort script
 | ||||
|         kill(-currentGroupId, SIGTERM); | ||||
|         return this->ioContext.provider->getTimer().afterDelay(2*kj::SECONDS).then([this]{ | ||||
|             fprintf(stderr, "[laminar] sending SIGKILL to process group %d\n", currentGroupId); | ||||
|             kill(-currentGroupId, SIGKILL); | ||||
|             aborting = true; | ||||
|             aggressive_recursive_kill(getpid()); | ||||
|         }); | ||||
|     })); | ||||
| 
 | ||||
| @ -223,14 +259,18 @@ kj::Promise<void> Leader::reapChildProcesses() | ||||
|                     // waiting for is done
 | ||||
|                     return reapChildProcesses(); | ||||
|                 } | ||||
|                 // we were aborted by the primary process already, just wait until all
 | ||||
|                 // SIGKILLs are processed
 | ||||
|                 if(aborting) { | ||||
|                     return reapChildProcesses(); | ||||
|                 } | ||||
|                 // Otherwise, reparented orphans are on borrowed time
 | ||||
|                 // TODO list wayward processes?
 | ||||
|                 fprintf(stderr, "[laminar] sending SIGHUP to adopted child processes\n"); | ||||
|                 kill(-currentGroupId, SIGHUP); | ||||
|                 return ioContext.provider->getTimer().afterDelay(5*kj::SECONDS).then([this]{ | ||||
|                     fprintf(stderr, "[laminar] sending SIGKILL to process group %d\n", currentGroupId); | ||||
|                     // TODO: should we mark the job as failed if we had to kill reparented processes?
 | ||||
|                     kill(-currentGroupId, SIGKILL); | ||||
|                     aggressive_recursive_kill(getpid()); | ||||
|                     return reapChildProcesses(); | ||||
|                 }).exclusiveJoin(reapChildProcesses()); | ||||
|             } else if(pid == currentScriptPid) { | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user