# coding: utf-8
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

from tests.functional import *


def convert_time(fmt, tm, fixdate=False):
    """
    Convert the given timestamp <tm> into the given format <fmt>.
    If fixdate is True, add a <space> before the day of the month when it
    is a single digit (this matches the ctime-style output that qstat uses).
    """
    rv = time.strftime(fmt, time.localtime(float(tm)))
    if (sys.platform not in ('cygwin', 'win32')) and fixdate:
        rv = rv.split()
        date = int(rv[2])
        if date <= 9:
            date = ' ' + str(date)
        rv[2] = str(date)
        rv = ' '.join(rv)
    return rv
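
# Illustrative usage (hypothetical values; the exact result depends on the
# local timezone): convert_time("%a %b %d %H:%M:%S %Y", "1514793600",
# fixdate=True) could return "Mon Jan  1 00:00:00 2018" -- note the two
# spaces before the single-digit day, mirroring ctime()/qstat formatting.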


class TestPbsNodeRampDown(TestFunctional):
    """
    This tests the Node Rampdown Feature, where, while a job is running,
    nodes/resources assigned from non-mother-superior (sister) MoMs can be
    released.

    Custom parameters:
    moms: colon-separated hostnames of three MoMs
    """

    def transform_select(self, select):
        """
        Take a select substring:
            "<res1>=<val1>:<res2>=<val2>...:<resN>=<valN>"
        and transform it so that if any of the resources
        (res1, res2, ..., resN) matches 'mem' and the corresponding value
        has a suffix of 'gb', the value is converted to its 'kb'
        equivalent. Also, prepend "1:" to the returned select substring.
        Ex:
            % str = "ncpus=7:mem=2gb:ompthreads=3"
            % transform_select(str)
            1:ompthreads=3:mem=2097152kb:ncpus=7
        """
        sel_list = select.split(':')
        mystr = "1:"
        for index in range(len(sel_list) - 1, -1, -1):
            if index != len(sel_list) - 1:
                mystr += ":"
            nums = [s for s in sel_list[index] if s.isdigit()]
            key = sel_list[index].split('=')[0]
            if key == "mem":
                mystr += sel_list[index].\
                    replace(nums[0] + "gb",
                            str(int(nums[0]) * 1024 * 1024)) + "kb"
            else:
                mystr += sel_list[index]
        return mystr

    def pbs_nodefile_match_exec_host(self, jid, exec_host,
                                     schedselect=None):
        """
        Look into the PBS_NODEFILE on the first host listed in 'exec_host'
        and return True if all host entries in 'exec_host' match the
        entries in the file. Otherwise, return False.

        Look for 'mpiprocs' values in 'schedselect' (if not None), and
        verify that the corresponding node hosts appear in PBS_NODEFILE
        'mpiprocs' number of times.
        """
        pbs_nodefile = os.path.join(self.server.
                                    pbs_conf['PBS_HOME'], 'aux', jid)
        # look for mpiprocs settings
        mpiprocs = []
        if schedselect is not None:
            select_list = schedselect.split('+')
            for chunk in select_list:
                chl = chunk.split(':')
                for ch in chl:
                    if ch.find('=') != -1:
                        c = ch.split('=')
                        if c[0] == "mpiprocs":
                            mpiprocs.append(c[1])
        ehost = exec_host.split('+')
        first_host = ehost[0].split('/')[0]
        cmd = ['cat', pbs_nodefile]
        ret = self.server.du.run_cmd(first_host, cmd, sudo=False)
        ehost2 = []
        for h in ret['out']:
            ehost2.append(h.split('.')[0])
        ehost1 = []
        j = 0
        for eh in ehost:
            h = eh.split('/')
            if len(mpiprocs) > 0:
                for k in range(int(mpiprocs[j])):
                    ehost1.append(h[0])
            else:
                ehost1.append(h[0])
            j += 1
        if cmp(ehost1, ehost2) != 0:
            return False
        return True
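
    # Illustrative expectation (hypothetical hostnames): for exec_host
    # "hostA/0*0+hostB/0*0+hostC/0*2" and no mpiprocs in the schedselect,
    # this helper expects $PBS_HOME/aux/<jobid> on hostA to list exactly one
    # line per exec_host chunk: hostA, hostB, hostC. With mpiprocs=2 on a
    # chunk, that chunk's host must instead appear twice.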

    def license_count_match(self, num_licenses):
        """
        This will fail on an assert if the server's license_count 'Used'
        value does not equal 'num_licenses'.
        """
        n = retry = 5
        for _ in range(n):
            server_stat = self.server.status(SERVER, 'license_count')
            lic_count = server_stat[0]['license_count']
            for lic in lic_count.split():
                lic_split = lic.split(':')
                if lic_split[0] == 'Used':
                    actual_licenses = int(lic_split[1])
                    if actual_licenses == num_licenses:
                        return
                    break
            retry -= 1
            if retry == 0:
                raise AssertionError("not found %d licenses" % (num_licenses,))
            self.logger.info("sleeping 3 secs before next retry")
            time.sleep(3)
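
    # e.g. after a job requesting a total of ncpus=8 starts running, the
    # tests below call self.license_count_match(8) to confirm the server
    # reports 8 licenses in use.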

    def match_accounting_log(self, atype, jid, exec_host, exec_vnode,
                             mem, ncpus, nodect, place, select):
        """
        This checks if there's an accounting log record 'atype' for
        job 'jid' containing the values given (i.e. exec_host,
        exec_vnode, Resource_List.mem, Resource_List.ncpus, etc...).
        This throws an exception upon encountering a non-matching
        accounting_logs entry.
        Some example values of 'atype' are: 'u' (update record due to
        release node request), 'c' (record containing the next
        set of resources to be used by a phased job as a result of
        release node request), 'e' (last update record for a phased job
        due to a release node request), 'E' (end of job record).
        """
        self.server.accounting_match(
            msg=".*%s;%s.*exec_host=%s.*" % (atype, jid, exec_host),
            regexp=True, n=20)
        self.server.accounting_match(
            msg=".*%s;%s.*exec_vnode=%s.*" % (atype, jid, exec_vnode),
            regexp=True, n=20)
        self.server.accounting_match(
            msg=".*%s;%s.*Resource_List\.mem=%s.*" % (atype, jid, mem),
            regexp=True, n=20)
        self.server.accounting_match(
            msg=".*%s;%s.*Resource_List\.ncpus=%d.*" % (atype, jid, ncpus),
            regexp=True, n=20)
        self.server.accounting_match(
            msg=".*%s;%s.*Resource_List\.nodect=%d.*" % (atype, jid, nodect),
            regexp=True, n=20)
        self.server.accounting_match(
            msg=".*%s;%s.*Resource_List\.place=%s.*" % (atype, jid, place),
            regexp=True, n=20)
        self.server.accounting_match(
            msg=".*%s;%s.*Resource_List\.select=%s.*" % (atype, jid, select),
            regexp=True, n=20)
        if atype != 'c':
            self.server.accounting_match(
                msg=".*%s;%s.*resources_used\..*" % (atype, jid),
                regexp=True, n=20)
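
    # Each match above targets a record of the general shape (job id and
    # values are hypothetical):
    #   ...;u;42.server;... exec_host=... exec_vnode=... Resource_List.mem=...
    # i.e. the record type and job id, followed by the attribute=value pairs
    # asserted by the regular expressions.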

    def match_vnode_status(self, vnode_list, state, jobs=None, ncpus=None,
                           mem=None):
        """
        Given a list of vnode names in 'vnode_list', check to make
        sure each vnode's state, jobs string, resources_assigned.mem,
        and resources_assigned.ncpus match the passed arguments.
        This will throw an exception if a match is not found.
        """
        for vn in vnode_list:
            dict_match = {'state': state}
            if jobs is not None:
                dict_match['jobs'] = jobs
            if ncpus is not None:
                dict_match['resources_assigned.ncpus'] = ncpus
            if mem is not None:
                dict_match['resources_assigned.mem'] = mem
            self.server.expect(VNODE, dict_match, id=vn)
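
    # Typical call from the tests below (vnode names and job id come from
    # setUp()):
    #   self.match_vnode_status([self.n1, self.n2], 'job-busy',
    #                           "%s/0" % jid, 1, '1048576kb')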

    def create_and_submit_job(self, job_type, attribs={}):
        """
        Create the job object and submit it to the server
        based on 'job_type' and attributes list 'attribs'.
        """
        retjob = Job(TEST_USER, attrs=attribs)
        if job_type == 'job1':
            retjob.create_script(self.script['job1'])
        elif job_type == 'job1_1':
            retjob.create_script(self.script['job1_1'])
        elif job_type == 'job1_2':
            retjob.create_script(self.script['job1_2'])
        elif job_type == 'job1_3':
            retjob.create_script(self.script['job1_3'])
        elif job_type == 'job1_5':
            retjob.create_script(self.script['job1_5'])
        elif job_type == 'job1_6':
            retjob.create_script(self.script['job1_6'])
        elif job_type == 'job1_extra_res':
            retjob.create_script(self.script['job1_extra_res'])
        elif job_type == 'job2':
            retjob.create_script(self.script['job2'])
        elif job_type == 'job3':
            retjob.create_script(self.script['job3'])
        elif job_type == 'job5':
            retjob.create_script(self.script['job5'])
        elif job_type == 'job11':
            retjob.create_script(self.script['job11'])
        elif job_type == 'job11x':
            retjob.create_script(self.script['job11x'])
        elif job_type == 'job12':
            retjob.create_script(self.script['job12'])
        elif job_type == 'job13':
            retjob.create_script(self.script['job13'])
        elif job_type == 'jobA':
            retjob.create_script(self.script['jobA'])
        return self.server.submit(retjob)
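
    # e.g. jid = self.create_and_submit_job('job1') submits the script stored
    # in self.script['job1'] (built in setUp()) and returns the job id; extra
    # job attributes can be passed via 'attribs', for instance
    # attribs={ATTR_N: 'rampdown'} (hypothetical example) to set the job name.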

    def setUp(self):
        if len(self.moms) != 3:
            self.skip_test(reason="need 3 mom hosts: -p moms=<m1>:<m2>:<m3>")
        TestFunctional.setUp(self)
        Job.dflt_attributes[ATTR_k] = 'oe'
        self.server.cleanup_jobs(extend="force")
        self.momA = self.moms.values()[0]
        self.momB = self.moms.values()[1]
        self.momC = self.moms.values()[2]

        # Now start setting up and creating the vnodes
        self.server.manager(MGR_CMD_DELETE, NODE, None, "")

        # set node momA
        self.hostA = self.momA.shortname
        self.momA.delete_vnode_defs()
        vnode_prefix = self.hostA
        a = {'resources_available.mem': '1gb',
             'resources_available.ncpus': '1'}
        vnodedef = self.momA.create_vnode_def(vnode_prefix, a, 4)
        self.assertNotEqual(vnodedef, None)
        self.momA.insert_vnode_def(vnodedef, 'vnode.def')
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA)

        # set node momB
        self.hostB = self.momB.shortname
        self.momB.delete_vnode_defs()
        vnode_prefix = self.hostB
        a = {'resources_available.mem': '1gb',
             'resources_available.ncpus': '1'}
        vnodedef = self.momB.create_vnode_def(vnode_prefix, a, 5,
                                              usenatvnode=True)
        self.assertNotEqual(vnodedef, None)
        self.momB.insert_vnode_def(vnodedef, 'vnode.def')
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB)

        # set node momC
        # This one has no vnode definition.
        self.hostC = self.momC.shortname
        self.momC.delete_vnode_defs()
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostC)
        a = {'resources_available.ncpus': 2,
             'resources_available.mem': '2gb'}
        # set natural vnode of hostC
        self.server.manager(MGR_CMD_SET, NODE, a, id=self.hostC,
                            expect=True)
        a = {'state': 'free', 'resources_available.ncpus': (GE, 1)}
        self.server.expect(VNODE, {'state=free': 11}, op=EQ, count=True,
                           max_attempts=10, interval=2)

        # Various node names
        self.n0 = self.hostA
        self.n1 = '%s[0]' % (self.hostA,)
        self.n2 = '%s[1]' % (self.hostA,)
        self.n3 = '%s[2]' % (self.hostA,)
        self.n4 = self.hostB
        self.n5 = '%s[0]' % (self.hostB,)
        self.n6 = '%s[1]' % (self.hostB,)
        self.n7 = self.hostC
        self.n8 = '%s[3]' % (self.hostA,)
        self.n9 = '%s[2]' % (self.hostB,)
        self.n10 = '%s[3]' % (self.hostB,)
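
        # Resulting layout, 11 free vnodes in total:
        #   hostA: natural vnode n0 plus defined vnodes n1, n2, n3, n8
        #          (1 ncpu and 1gb each)
        #   hostB: usenatvnode=True, so the natural vnode n4 and the defined
        #          vnodes n5, n6, n9, n10 each get 1 ncpu and 1gb
        #   hostC: just the natural vnode n7, set to 2 ncpus and 2gb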

        if sys.platform in ('cygwin', 'win32'):
            SLEEP_CMD = "pbs-sleep"
        else:
            SLEEP_CMD = "/bin/sleep"

        self.pbs_release_nodes_cmd = os.path.join(
            self.server.pbs_conf['PBS_EXEC'], 'bin', 'pbs_release_nodes')

        FIB40 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
            'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(40)\\\")"'

        FIB45 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
            'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(45)\\\")"'

        FIB50 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
            'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(50)\\\")"'

        FIB400 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
            'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(400)\\\")"'
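
        # The FIB* commands run a naive recursive Fibonacci via pbs_python
        # purely to keep the assigned CPUs busy: fib(40), fib(45) and fib(50)
        # run for progressively longer, and fib(400) is effectively unbounded
        # for the duration of a test.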

        # job submission arguments
        self.script = {}
        self.job1_select = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=2:mem=2gb"
        self.job1_place = "scatter"
        # expected values upon successful job submission
        self.job1_schedselect = "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+" + \
                                "1:ncpus=2:mem=2gb"
        self.job1_exec_host = "%s/0*0+%s/0*0+%s/0*2" % (
            self.n0, self.n4, self.n7)
        self.job1_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n1,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n2,) + \
            "%s:ncpus=1)+" % (self.n3,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
            "%s:ncpus=1)+" % (self.n6,) + \
            "(%s:ncpus=2:mem=2097152kb)" % (self.n7,)
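
        # How the three select chunks are expected to be satisfied:
        # super-chunk 1 -> hostA vnodes n1 + n2 (1gb, 1 ncpu each) + n3
        # (ncpus only), super-chunk 2 -> hostB vnodes n4 + n5 + n6 likewise,
        # and chunk 3 -> hostC natural vnode n7 (2 ncpus, 2gb).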

        self.job1_sel_esc = self.job1_select.replace("+", "\+")
        self.job1_exec_host_esc = self.job1_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        self.job1_exec_vnode_esc = self.job1_exec_vnode.replace(
            "[", "\[").replace("]", "\]").replace("(", "\(").replace(
            ")", "\)").replace("+", "\+")
        self.job1_newsel = self.transform_select(self.job1_select.split(
            '+')[0])
        self.job1_new_exec_host = self.job1_exec_host.split('+')[0]
        self.job1_new_exec_vnode = self.job1_exec_vnode.split(')')[0] + ')'
        self.job1_new_exec_vnode_esc = \
            self.job1_new_exec_vnode.replace("[", "\[").replace(
                "]", "\]").replace("(", "\(").replace(")", "\)").replace(
                "+", "\+")

        self.script['job1'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job1_select + "\n" + \
            "#PBS -l place=" + self.job1_place + "\n" + \
            "#PBS -W stageout=test.img@%s:test.img\n" % (self.n4,) + \
            "#PBS -W release_nodes_on_stageout=true\n" + \
            "dd if=/dev/zero of=test.img count=1024 bs=1048576\n" + \
            "pbsdsh -n 1 -- %s\n" % (FIB40,) + \
            "pbsdsh -n 2 -- %s\n" % (FIB40,) + \
            "%s\n" % (FIB50,)

        self.script['job1_1'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job1_select + "\n" + \
            "#PBS -l place=" + self.job1_place + "\n" + \
            "#PBS -W stageout=test.img@%s:test.img\n" % (self.n4,) + \
            "#PBS -W release_nodes_on_stageout=false\n" + \
            "dd if=/dev/zero of=test.img count=1024 bs=1048576\n" + \
            "pbsdsh -n 1 -- %s\n" % (FIB40,) + \
            "pbsdsh -n 2 -- %s\n" % (FIB40,) + \
            "%s\n" % (FIB50,)

        self.script['job1_2'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job1_select + "\n" + \
            "#PBS -l place=" + self.job1_place + "\n" + \
            "#PBS -W stageout=test.img@%s:test.img\n" % (self.n4,) + \
            "dd if=/dev/zero of=test.img count=1024 bs=1048576\n" + \
            "pbsdsh -n 1 -- %s\n" % (FIB40,) + \
            "pbsdsh -n 2 -- %s\n" % (FIB40,) + \
            "%s\n" % (FIB50,)

        self.script['job1_3'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job1_select + "\n" + \
            "#PBS -l place=" + self.job1_place + "\n" + \
            SLEEP_CMD + " 5\n" + \
            "pbs_release_nodes -a\n" + \
            "%s\n" % (FIB50,)

        self.script['job1_5'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job1_select + "\n" + \
            "#PBS -l place=" + self.job1_place + "\n" + \
            "pbsdsh -n 1 -- %s &\n" % (FIB45,) + \
            "pbsdsh -n 2 -- %s &\n" % (FIB45,) + \
            "%s\n" % (FIB45,)

        self.script['jobA'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job1_select + "\n" + \
            "#PBS -l place=" + self.job1_place + "\n" + \
            "#PBS -J 1-5\n"\
            "pbsdsh -n 1 -- %s &\n" % (FIB45,) + \
            "pbsdsh -n 2 -- %s &\n" % (FIB45,) + \
            "%s\n" % (FIB45,)

        self.script['job1_6'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job1_select + "\n" + \
            "#PBS -l place=" + self.job1_place + "\n" + \
            SLEEP_CMD + " 5\n" + \
            self.pbs_release_nodes_cmd + " " + self.n4 + "\n" + \
            "%s\n" % (FIB50,)

        self.job1_extra_res_select = \
            "ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
            "ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
            "ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
        self.job1_extra_res_place = "scatter"
        self.job1_extra_res_schedselect = \
            "1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
            "1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
            "1:ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
        self.job1_extra_res_exec_host = "%s/0*0+%s/0*0+%s/0*2" % (
            self.n0, self.n4, self.n7)
        self.job1_extra_res_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n1,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n2,) + \
            "%s:ncpus=1)+" % (self.n3,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
            "%s:ncpus=1)+" % (self.n6,) + \
            "(%s:ncpus=2:mem=2097152kb)" % (self.n7,)

        self.script['job1_extra_res'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job1_extra_res_select + "\n" + \
            "#PBS -l place=" + self.job1_extra_res_place + "\n" + \
            "pbsdsh -n 1 -- %s &\n" % (FIB40,) + \
            "pbsdsh -n 2 -- %s &\n" % (FIB40,) + \
            "%s\n" % (FIB50,)

        self.job2_select = "ncpus=1:mem=1gb+ncpus=4:mem=4gb+ncpus=2:mem=2gb"
        self.job2_place = "scatter"
        self.job2_schedselect = "1:ncpus=1:mem=1gb+1:ncpus=4:mem=4gb+" + \
                                "1:ncpus=2:mem=2gb"
        self.job2_exec_host = "%s/1+%s/1*0+%s/1*2" % (
            self.n0, self.n4, self.n7)
        self.job2_exec_vnode = \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.n8,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n9,) + \
            "%s:mem=1048576kb:ncpus=1)+" % (self.n10,) + \
            "(%s:ncpus=2:mem=2097152kb)" % (self.n7,)
        self.job2_exec_vnode_var1 = \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.n8,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n6,) + \
            "%s:mem=1048576kb:ncpus=1)+" % (self.n9,) + \
            "(%s:ncpus=2:mem=2097152kb)" % (self.n7,)

        self.script['job2'] = \
            "#PBS -l select=" + self.job2_select + "\n" + \
            "#PBS -l place=" + self.job2_place + "\n" + \
            SLEEP_CMD + " 60\n"

        self.script['job3'] = \
            "#PBS -l select=vnode=" + self.n4 + "+vnode=" + self.n0 + \
            ":mem=4mb\n" + SLEEP_CMD + " 30\n"

        self.script['job5'] = \
            "#PBS -l select=vnode=" + self.n0 + ":mem=4mb\n" + \
            SLEEP_CMD + " 300\n"

        self.job11x_select = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=1:mem=1gb"
        self.job11x_place = "scatter:excl"
        self.job11x_schedselect = "1:ncpus=3:mem=2gb+" + \
            "1:ncpus=3:mem=2gb+1:ncpus=1:mem=1gb"
        self.job11x_exec_host = "%s/0*0+%s/0*0+%s/0" % (
            self.n0, self.n4, self.n7)
        self.job11x_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n1,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n2,) + \
            "%s:ncpus=1)+" % (self.n3,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
            "%s:ncpus=1)+" % (self.n6,) + \
            "(%s:ncpus=1:mem=1048576kb)" % (self.n7,)

        self.script['job11x'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job11x_select + "\n" + \
            "#PBS -l place=" + self.job11x_place + "\n" + \
            "pbsdsh -n 1 -- %s\n" % (FIB40,) + \
            "pbsdsh -n 2 -- %s\n" % (FIB40,) + \
            "%s\n" % (FIB50,)

        self.job11_select = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=1:mem=1gb"
        self.job11_place = "scatter"
        self.job11_schedselect = "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+" + \
            "1:ncpus=1:mem=1gb"
        self.job11_exec_host = "%s/0*0+%s/0*0+%s/0" % (
            self.n0, self.n4, self.n7)
        self.job11_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n1,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n2,) + \
            "%s:ncpus=1)+" % (self.n3,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
            "%s:ncpus=1)+" % (self.n6,) + \
            "(%s:ncpus=1:mem=1048576kb)" % (self.n7,)

        self.script['job11'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job11_select + "\n" + \
            "#PBS -l place=" + self.job11_place + "\n" + \
            "pbsdsh -n 1 -- %s\n" % (FIB40,) + \
            "pbsdsh -n 2 -- %s\n" % (FIB40,) + \
            "%s\n" % (FIB50,)

        self.job12_select = "vnode=%s:ncpus=1:mem=1gb" % (self.n7,)
        self.job12_schedselect = "1:vnode=%s:ncpus=1:mem=1gb" % (self.n7,)
        self.job12_place = "free"
        self.job12_exec_host = "%s/1" % (self.n7,)
        self.job12_exec_vnode = "(%s:ncpus=1:mem=1048576kb)" % (self.n7,)

        self.script['job12'] = \
            "#PBS -l select=" + self.job12_select + "\n" + \
            "#PBS -l place=" + self.job12_place + "\n" + \
            SLEEP_CMD + " 60\n"

        self.job13_select = "3:ncpus=1"

        self.script['job13'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job13_select + "\n" + \
            "#PBS -l place=" + self.job1_place + "\n" + \
            "pbsdsh -n 1 -- %s\n" % (FIB400,) + \
            "pbsdsh -n 2 -- %s\n" % (FIB400,) + \
            "pbsdsh -n 3 -- %s\n" % (FIB400,)

    def tearDown(self):
        self.momA.signal("-CONT")
        self.momB.signal("-CONT")
        self.momC.signal("-CONT")
        TestFunctional.tearDown(self)
        # Delete managers and operators if added
        attrib = ['operators', 'managers']
        self.server.manager(MGR_CMD_UNSET, SERVER, attrib, expect=True)

    def test_release_nodes_on_stageout_true(self):
        """
        Test:
            qsub -W release_nodes_on_stageout=true job.script
        where job.script specifies a select spec of
        2 super-chunks of ncpus=3 and mem=2gb each,
        and 1 chunk of ncpus=2 and mem=2gb, along with
        a place spec of "scatter".
        With the release_nodes_on_stageout=true option, when the job is
        deleted and runs a lengthy stageout process, only the primary
        execution host's vnodes are left assigned to the job.
        """
        # job1's script contains the directive
        # release_nodes_on_stageout=true
        jid = self.create_and_submit_job('job1')
        self.server.expect(JOB, {'job_state': 'R',
                                 'release_nodes_on_stageout': 'True',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode}, id=jid)

        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(8)

        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))

        # Deleting the job triggers the stageout process, at which time the
        # sister nodes are automatically released because
        # release_nodes_on_stageout=true is set
        self.server.delete(jid)

        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.n4), n=10,
            max_attempts=18, interval=2, regexp=True)
        self.momA.log_match(
            "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.n7), n=10,
            max_attempts=18, interval=2, regexp=True)
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            max_attempts=18, interval=2)
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            max_attempts=18, interval=2)

        # Verify remaining job resources.
        self.server.expect(JOB, {'job_state': 'E',
                                 'Resource_List.mem': '2gb',
                                 'Resource_List.ncpus': 3,
                                 'Resource_List.select': self.job1_newsel,
                                 'Resource_List.place': self.job1_place,
                                 'Resource_List.nodect': 1,
                                 'schedselect': self.job1_newsel,
                                 'exec_host': self.job1_new_exec_host,
                                 'exec_vnode': self.job1_new_exec_vnode},
                           id=jid)

        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(3)

        # Check various vnode status
        self.match_vnode_status([self.n1, self.n2],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
                                 self.n7, self.n8, self.n9, self.n10], 'free')
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))

        # Check accounting update ('u') record
        self.match_accounting_log('u', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc, "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1_sel_esc)

        # Check to make sure the 'c' (next) record got generated
        self.match_accounting_log('c', jid, self.job1_new_exec_host,
                                  self.job1_new_exec_vnode_esc, "2097152kb",
                                  3, 1, self.job1_place, self.job1_newsel)

    def test_release_nodes_on_stageout_false(self):
        """
        Test:
            qsub -W release_nodes_on_stageout=false job.script
        where job.script specifies a select spec of
        2 super-chunks of ncpus=3 and mem=2gb each,
        and 1 chunk of ncpus=2 and mem=2gb, along with
        a place spec of "scatter".
        With the release_nodes_on_stageout=false option, when the job is
        deleted and runs a lengthy stageout process, nothing changes in
        the job's vnode assignment.
        """
        # job1_1's script contains the directive
        # release_nodes_on_stageout=false
        jid = self.create_and_submit_job('job1_1')
        self.server.expect(JOB, {'job_state': 'R',
                                 'release_nodes_on_stageout': 'False',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode}, id=jid)

        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(8)

        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')

        # Deleting the job should not trigger automatic release of nodes
        # because release_nodes_on_stageout=false is set
        self.server.delete(jid)

        # Verify mom_logs
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            max_attempts=5, interval=1,
                            existence=False)
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            max_attempts=5, interval=1,
                            existence=False)

        # Verify no change in remaining job resources.
        self.server.expect(JOB, {'job_state': 'E',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode}, id=jid)

        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(8)

        # Check various vnode status.
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6], 'job-busy',
                                jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')

        # Check that no accounting update ('u') record exists
        self.server.accounting_match(
            msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
            regexp=True, n=20, existence=False, max_attempts=5, interval=1)

        # Check that no accounting next ('c') record exists
        self.server.accounting_match(
            msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
            regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  694. def test_release_nodes_on_stageout_default(self):
  695. """
  696. Test:
  697. qsub: no -Wrelease_nodes_on_stageout
  698. option given.
  699. Job runs as normal.
  700. """
  701. jid = self.create_and_submit_job('job1_2')
  702. self.server.expect(JOB, {'job_state': 'R',
  703. 'Resource_List.mem': '6gb',
  704. 'Resource_List.ncpus': 8,
  705. 'Resource_List.nodect': 3,
  706. 'Resource_List.select': self.job1_select,
  707. 'Resource_List.place': self.job1_place,
  708. 'schedselect': self.job1_schedselect,
  709. 'exec_host': self.job1_exec_host,
  710. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  711. # server's license_count used value matches job's 'ncpus' value.
  712. self.license_count_match(8)
  713. # Check various vnode status.
  714. jobs_assn1 = "%s/0" % (jid,)
  715. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  716. 'job-busy', jobs_assn1, 1, '1048576kb')
  717. self.match_vnode_status([self.n3, self.n6],
  718. 'job-busy', jobs_assn1, 1, '0kb')
  719. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  720. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  721. 2, '2097152kb')
  722. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  723. self.server.delete(jid)
  724. # Verify mom_logs
  725. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  726. max_attempts=5, interval=1,
  727. existence=False)
  728. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  729. max_attempts=5, interval=1,
  730. existence=False)
  731. # Verify no change in remaining job resources.
  732. self.server.expect(JOB, {'job_state': 'E',
  733. 'Resource_List.mem': '6gb',
  734. 'Resource_List.ncpus': 8,
  735. 'Resource_List.nodect': 3,
  736. 'Resource_List.select': self.job1_select,
  737. 'Resource_List.place': self.job1_place,
  738. 'schedselect': self.job1_schedselect,
  739. 'exec_host': self.job1_exec_host,
  740. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  741. # server's license_count used value matches job's 'ncpus' value.
  742. self.license_count_match(8)
  743. # Check various vnode status.
  744. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  745. 'job-busy', jobs_assn1, 1, '1048576kb')
  746. self.match_vnode_status([self.n3, self.n6], 'job-busy',
  747. jobs_assn1, 1, '0kb')
  748. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  749. 2, '2097152kb')
  750. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10],
  751. 'free')
  752. # Check for no existence of account update ('u') record
  753. self.server.accounting_match(
  754. msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
  755. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  756. # Check for no existence of account next ('c') record
  757. self.server.accounting_match(
  758. msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
  759. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  760. def test_release_nodes_on_stageout_true_qalter(self):
  761. """
  762. Test:
  763. qalter -W release_nodes_on_stageout=true.
  764. After running job is modified by qalter,
  765. with release_nodes_on_stageout=true option, when
  766. job is deleted and runs a lengthy stageout process,
  767. only the primary execution host's
  768. vnodes are left assigned to the job.
  769. """
  770. jid = self.create_and_submit_job('job1_2')
  771. self.server.expect(JOB, {'job_state': 'R',
  772. 'Resource_List.mem': '6gb',
  773. 'Resource_List.ncpus': 8,
  774. 'Resource_List.nodect': 3,
  775. 'Resource_List.select': self.job1_select,
  776. 'Resource_List.place': self.job1_place,
  777. 'schedselect': self.job1_schedselect,
  778. 'exec_host': self.job1_exec_host,
  779. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  780. # server's license_count used value matches job's 'ncpus' value.
  781. self.license_count_match(8)
  782. # run qalter -Wrelease_nodes_on_stageout=true
  783. self.server.alterjob(jid,
  784. {ATTR_W: 'release_nodes_on_stageout=true'})
  785. self.server.expect(JOB, {'release_nodes_on_stageout': 'True'}, id=jid)
  786. # Check various vnode status.
  787. jobs_assn1 = "%s/0" % (jid,)
  788. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  789. 'job-busy', jobs_assn1, 1, '1048576kb')
  790. self.match_vnode_status([self.n3, self.n6],
  791. 'job-busy', jobs_assn1, 1, '0kb')
  792. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  793. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  794. 2, '2097152kb')
  795. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  796. # This triggers the lengthy stageout process
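# job1_2's script presumably performs a slow stageout, keeping the job in
# the E (exiting) state long enough to inspect which vnodes stay assigned.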
  797. self.server.delete(jid)
  798. # Verify mom_logs
  799. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  800. jid, self.hostB), n=10,
  801. max_attempts=18, interval=2, regexp=True)
  802. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  803. jid, self.hostC), n=10,
  804. max_attempts=18, interval=2, regexp=True)
  805. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  806. max_attempts=18, interval=2)
  807. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  808. max_attempts=18, interval=2)
  809. self.server.expect(JOB, {'job_state': 'E',
  810. 'Resource_List.mem': '2gb',
  811. 'Resource_List.ncpus': 3,
  812. 'Resource_List.select': self.job1_newsel,
  813. 'Resource_List.place': self.job1_place,
  814. 'Resource_List.nodect': 1,
  815. 'schedselect': self.job1_newsel,
  816. 'exec_host': self.job1_new_exec_host,
  817. 'exec_vnode': self.job1_new_exec_vnode},
  818. id=jid)
  819. # server's license_count used value matches job's 'ncpus' value.
  820. self.license_count_match(3)
  821. # Check various vnode status
  822. self.match_vnode_status([self.n1, self.n2],
  823. 'job-busy', jobs_assn1, 1, '1048576kb')
  824. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1,
  825. 1, '0kb')
  826. self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
  827. self.n7, self.n8, self.n9, self.n10], 'free')
  828. self.assertTrue(
  829. self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
  830. # Check account update ('u') record
  831. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  832. self.job1_exec_vnode_esc, "6gb", 8, 3,
  833. self.job1_place,
  834. self.job1_sel_esc)
  835. # Check to make sure 'c' (next) record got generated
  836. self.match_accounting_log('c', jid, self.job1_new_exec_host,
  837. self.job1_new_exec_vnode_esc, "2097152kb",
  838. 3, 1, self.job1_place, self.job1_newsel)
  839. def test_release_nodes_on_stageout_false_qalter(self):
  840. """
  841. Test:
  842. qalter -W release_nodes_on_stageout=False.
  843. After running job is modified by qalter,
  844. With release_nodes_on_stageout=false option, when job is
  845. deleted and runs a lengthy stageout process, nothing
  846. changes in job's vnodes assignment.
  847. """
  848. jid = self.create_and_submit_job('job1_2')
  849. self.server.expect(JOB, {'job_state': 'R',
  850. 'Resource_List.mem': '6gb',
  851. 'Resource_List.ncpus': 8,
  852. 'Resource_List.nodect': 3,
  853. 'Resource_List.select': self.job1_select,
  854. 'Resource_List.place': self.job1_place,
  855. 'schedselect': self.job1_schedselect,
  856. 'exec_host': self.job1_exec_host,
  857. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  858. # server's license_count used value matches job's 'ncpus' value.
  859. self.license_count_match(8)
860. # run qalter -Wrelease_nodes_on_stageout=false
  861. self.server.alterjob(jid,
  862. {ATTR_W: 'release_nodes_on_stageout=false'})
  863. self.server.expect(JOB, {'release_nodes_on_stageout': 'False'}, id=jid)
  864. # Check various vnode status.
  865. jobs_assn1 = "%s/0" % (jid,)
  866. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  867. 'job-busy', jobs_assn1, 1, '1048576kb')
  868. self.match_vnode_status([self.n3, self.n6],
  869. 'job-busy', jobs_assn1, 1, '0kb')
  870. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  871. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  872. 2, '2097152kb')
  873. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
874. # This triggers the lengthy stageout process
  875. self.server.delete(jid)
  876. # Verify mom_logs
  877. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  878. max_attempts=5, interval=1,
  879. existence=False)
  880. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  881. max_attempts=5, interval=1,
  882. existence=False)
  883. # Verify no change in remaining job resources.
  884. self.server.expect(JOB, {'job_state': 'E',
  885. 'Resource_List.mem': '6gb',
  886. 'Resource_List.ncpus': 8,
  887. 'Resource_List.nodect': 3,
  888. 'Resource_List.select': self.job1_select,
  889. 'Resource_List.place': self.job1_place,
  890. 'schedselect': self.job1_schedselect,
  891. 'exec_host': self.job1_exec_host,
  892. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  893. # server's license_count used value matches job's 'ncpus' value.
  894. self.license_count_match(8)
  895. # Check various vnode status.
  896. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  897. 'job-busy', jobs_assn1, 1, '1048576kb')
  898. self.match_vnode_status([self.n3, self.n6], 'job-busy',
  899. jobs_assn1, 1, '0kb')
  900. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  901. 2, '2097152kb')
  902. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  903. # Check for no existence of account update ('u') record
  904. self.server.accounting_match(
  905. msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
  906. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  907. # Check for no existence of account next ('c') record
  908. self.server.accounting_match(
  909. msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
  910. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  911. def test_hook_release_nodes_on_stageout_true(self):
  912. """
  913. Test:
  914. Using a queuejob hook to set
  915. release_nodes_on_stageout=true.
  916. When job is deleted and runs a
  917. lengthy stageout process, only
  918. the primary execution host's
  919. vnodes are left assigned to the job.
  920. """
  921. hook_body = """
  922. import pbs
  923. pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
  924. pbs.event().job.release_nodes_on_stageout=True
  925. """
  926. hook_event = "queuejob"
  927. hook_name = "qjob"
  928. a = {'event': hook_event, 'enabled': 'true'}
  929. self.server.create_import_hook(hook_name, a, hook_body)
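# create_import_hook creates the queuejob hook, imports hook_body and
# enables it, so the qsub below should have release_nodes_on_stageout set
# before the job is queued.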
  930. jid = self.create_and_submit_job('job1_2')
  931. self.server.log_match("queuejob hook executed", n=20,
  932. max_attempts=25, interval=2)
  933. self.server.expect(JOB, {'job_state': 'R',
  934. 'release_nodes_on_stageout': 'True',
  935. 'Resource_List.mem': '6gb',
  936. 'Resource_List.ncpus': 8,
  937. 'Resource_List.nodect': 3,
  938. 'Resource_List.select': self.job1_select,
  939. 'Resource_List.place': self.job1_place,
  940. 'schedselect': self.job1_schedselect,
  941. 'exec_host': self.job1_exec_host,
  942. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  943. # server's license_count used value matches job's 'ncpus' value.
  944. self.license_count_match(8)
  945. # Check various vnode status.
  946. jobs_assn1 = "%s/0" % (jid,)
  947. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  948. 'job-busy', jobs_assn1, 1, '1048576kb')
  949. self.match_vnode_status([self.n3, self.n6],
  950. 'job-busy', jobs_assn1, 1, '0kb')
  951. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  952. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  953. 2, '2097152kb')
  954. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  955. self.assertTrue(
  956. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  957. # Deleting the job will trigger the stageout process
  958. # at which time sister nodes are automatically released
959. # due to release_nodes_on_stageout=true being set
  960. self.server.delete(jid)
  961. # Verify mom_logs
  962. self.momA.log_match(
  963. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.n4,), n=10,
  964. max_attempts=18, interval=2, regexp=True)
  965. self.momA.log_match(
  966. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
  967. max_attempts=18, interval=2, regexp=True)
  968. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  969. max_attempts=18, interval=2)
  970. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  971. max_attempts=18, interval=2)
  972. # Verify remaining job resources.
  973. self.server.expect(JOB, {'job_state': 'E',
  974. 'Resource_List.mem': '2gb',
  975. 'Resource_List.ncpus': 3,
  976. 'Resource_List.select': self.job1_newsel,
  977. 'Resource_List.place': self.job1_place,
  978. 'Resource_List.nodect': 1,
  979. 'schedselect': self.job1_newsel,
  980. 'exec_host': self.job1_new_exec_host,
  981. 'exec_vnode': self.job1_new_exec_vnode},
  982. id=jid)
  983. # server's license_count used value matches job's 'ncpus' value.
  984. self.license_count_match(3)
  985. # Check various vnode status
  986. self.match_vnode_status([self.n1, self.n2],
  987. 'job-busy', jobs_assn1, 1, '1048576kb')
  988. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
  989. self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
  990. self.n7, self.n8, self.n9, self.n10], 'free')
  991. self.assertTrue(
  992. self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
  993. # Check account update ('u') record
  994. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  995. self.job1_exec_vnode_esc, "6gb", 8, 3,
  996. self.job1_place,
  997. self.job1_sel_esc)
  998. # Check to make sure 'c' (next) record got generated
  999. self.match_accounting_log('c', jid, self.job1_new_exec_host,
  1000. self.job1_new_exec_vnode_esc, "2097152kb",
  1001. 3, 1, self.job1_place, self.job1_newsel)
  1002. def test_hook_release_nodes_on_stageout_false(self):
  1003. """
  1004. Test:
  1005. Using a queuejob hook to set
  1006. -Wrelease_nodes_on_stageout=False.
  1007. When job is deleted and runs a
  1008. lengthy stageout process, nothing
  1009. changes in job's vnodes assignment.
  1010. """
  1011. hook_body = """
  1012. import pbs
  1013. pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
  1014. pbs.event().job.release_nodes_on_stageout=False
  1015. """
  1016. hook_event = "queuejob"
  1017. hook_name = "qjob"
  1018. a = {'event': hook_event, 'enabled': 'true'}
  1019. self.server.create_import_hook(hook_name, a, hook_body)
  1020. jid = self.create_and_submit_job('job1_2')
  1021. self.server.log_match("queuejob hook executed", n=20,
  1022. max_attempts=25, interval=2)
  1023. self.server.expect(JOB, {'job_state': 'R',
  1024. 'release_nodes_on_stageout': 'False',
  1025. 'Resource_List.mem': '6gb',
  1026. 'Resource_List.ncpus': 8,
  1027. 'Resource_List.nodect': 3,
  1028. 'Resource_List.select': self.job1_select,
  1029. 'Resource_List.place': self.job1_place,
  1030. 'schedselect': self.job1_schedselect,
  1031. 'exec_host': self.job1_exec_host,
  1032. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1033. # server's license_count used value matches job's 'ncpus' value.
  1034. self.license_count_match(8)
  1035. # Check various vnode status.
  1036. jobs_assn1 = "%s/0" % (jid,)
  1037. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1038. 'job-busy', jobs_assn1, 1, '1048576kb')
  1039. jobs_assn1 = "%s/0" % (jid,)
  1040. self.match_vnode_status([self.n3, self.n6],
  1041. 'job-busy', jobs_assn1, 1, '0kb')
  1042. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1043. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1044. 2, '2097152kb')
  1045. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1046. # Deleting a job should not trigger automatic
1047. # release of nodes due to release_nodes_on_stageout=False
  1048. self.server.delete(jid)
  1049. # Verify mom_logs
  1050. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1051. max_attempts=5, interval=1,
  1052. existence=False)
  1053. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1054. max_attempts=5, interval=1,
  1055. existence=False)
  1056. # Verify no change in remaining job resources.
  1057. self.server.expect(JOB, {'job_state': 'E',
  1058. 'Resource_List.mem': '6gb',
  1059. 'Resource_List.ncpus': 8,
  1060. 'Resource_List.nodect': 3,
  1061. 'Resource_List.select': self.job1_select,
  1062. 'Resource_List.place': self.job1_place,
  1063. 'schedselect': self.job1_schedselect,
  1064. 'exec_host': self.job1_exec_host,
  1065. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1066. # server's license_count used value matches job's 'ncpus' value.
  1067. self.license_count_match(8)
  1068. # Check various vnode status.
  1069. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1070. 'job-busy', jobs_assn1, 1, '1048576kb')
  1071. self.match_vnode_status([self.n3, self.n6], 'job-busy',
  1072. jobs_assn1, 1, '0kb')
  1073. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1074. 2, '2097152kb')
  1075. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1076. # Check for no existence of account update ('u') record
  1077. self.server.accounting_match(
  1078. msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
  1079. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1080. # Check for no existence of account next ('c') record
  1081. self.server.accounting_match(
  1082. msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
  1083. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1084. def test_hook2_release_nodes_on_stageout_true(self):
  1085. """
  1086. Test:
  1087. Using a modifyjob hook to set
  1088. release_nodes_on_stageout=true.
  1089. When job is deleted and runs a
  1090. lengthy stageout process, only
  1091. the primary execution host's
  1092. vnodes are left assigned to the job.
  1093. """
  1094. hook_body = """
  1095. import pbs
  1096. pbs.logmsg(pbs.LOG_DEBUG, "modifyjob hook executed")
  1097. pbs.event().job.release_nodes_on_stageout=True
  1098. """
  1099. hook_event = "modifyjob"
  1100. hook_name = "mjob"
  1101. a = {'event': hook_event, 'enabled': 'true'}
  1102. self.server.create_import_hook(hook_name, a, hook_body)
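# A modifyjob hook only fires when the job is altered (e.g. via qalter),
# so release_nodes_on_stageout is not set at submission time; it is set by
# the alterjob() call further below.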
  1103. jid = self.create_and_submit_job('job1_2')
  1104. self.server.expect(JOB, {'job_state': 'R',
  1105. 'Resource_List.mem': '6gb',
  1106. 'Resource_List.ncpus': 8,
  1107. 'Resource_List.nodect': 3,
  1108. 'Resource_List.select': self.job1_select,
  1109. 'Resource_List.place': self.job1_place,
  1110. 'schedselect': self.job1_schedselect,
  1111. 'exec_host': self.job1_exec_host,
  1112. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1113. # server's license_count used value matches job's 'ncpus' value.
  1114. self.license_count_match(8)
  1115. # Check various vnode status.
  1116. jobs_assn1 = "%s/0" % (jid,)
  1117. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1118. 'job-busy', jobs_assn1, 1, '1048576kb')
  1119. self.match_vnode_status([self.n3, self.n6],
  1120. 'job-busy', jobs_assn1, 1, '0kb')
  1121. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1122. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1123. 2, '2097152kb')
  1124. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1125. # This triggers the modifyjob hook
  1126. self.server.alterjob(jid, {ATTR_N: "test"})
  1127. self.server.log_match("modifyjob hook executed", n=100,
  1128. max_attempts=25, interval=2)
  1129. self.server.expect(JOB, {'release_nodes_on_stageout': 'True'}, id=jid)
  1130. self.assertTrue(
  1131. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  1132. # Deleting the job will trigger the stageout process
  1133. # at which time sister nodes are automatically released
1134. # due to release_nodes_on_stageout=true being set
  1135. self.server.delete(jid)
  1136. # Verify mom_logs
  1137. self.momA.log_match(
  1138. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
  1139. max_attempts=18, interval=2, regexp=True)
  1140. self.momA.log_match(
  1141. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
  1142. max_attempts=18, interval=2, regexp=True)
  1143. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1144. max_attempts=18, interval=2)
  1145. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1146. max_attempts=18, interval=2)
  1147. # Verify remaining job resources.
  1148. self.server.expect(JOB, {'job_state': 'E',
  1149. 'Resource_List.mem': '2gb',
  1150. 'Resource_List.ncpus': 3,
  1151. 'Resource_List.select': self.job1_newsel,
  1152. 'Resource_List.place': self.job1_place,
  1153. 'Resource_List.nodect': 1,
  1154. 'schedselect': self.job1_newsel,
  1155. 'exec_host': self.job1_new_exec_host,
  1156. 'exec_vnode': self.job1_new_exec_vnode},
  1157. id=jid)
  1158. # server's license_count used value matches job's 'ncpus' value.
  1159. self.license_count_match(3)
  1160. # Check various vnode status.
  1161. self.match_vnode_status([self.n1, self.n2],
  1162. 'job-busy', jobs_assn1, 1, '1048576kb')
  1163. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
  1164. self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
  1165. self.n7, self.n8, self.n9, self.n10], 'free')
  1166. self.assertTrue(
  1167. self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
  1168. # Check account update ('u') record
  1169. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  1170. self.job1_exec_vnode_esc, "6gb", 8, 3,
  1171. self.job1_place,
  1172. self.job1_sel_esc)
  1173. # Check to make sure 'c' (next) record got generated
  1174. self.match_accounting_log('c', jid, self.job1_new_exec_host,
  1175. self.job1_new_exec_vnode_esc, "2097152kb",
  1176. 3, 1, self.job1_place, self.job1_newsel)
  1177. def test_hook2_release_nodes_on_stageout_false(self):
  1178. """
  1179. Test:
  1180. Using a modifyjob hook to set
  1181. release_nodes_on_stageout=False.
  1182. When job is deleted and runs a
  1183. lengthy stageout process, nothing
  1184. changes in job's vnodes assignment.
  1185. """
  1186. hook_body = """
  1187. import pbs
  1188. pbs.logmsg(pbs.LOG_DEBUG, "modifyjob hook executed")
  1189. pbs.event().job.release_nodes_on_stageout=False
  1190. """
  1191. hook_event = "modifyjob"
  1192. hook_name = "mjob"
  1193. a = {'event': hook_event, 'enabled': 'true'}
  1194. self.server.create_import_hook(hook_name, a, hook_body)
  1195. jid = self.create_and_submit_job('job1_2')
  1196. self.server.expect(JOB, {'job_state': 'R',
  1197. 'Resource_List.mem': '6gb',
  1198. 'Resource_List.ncpus': 8,
  1199. 'Resource_List.nodect': 3,
  1200. 'Resource_List.select': self.job1_select,
  1201. 'Resource_List.place': self.job1_place,
  1202. 'schedselect': self.job1_schedselect,
  1203. 'exec_host': self.job1_exec_host,
  1204. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1205. # server's license_count used value matches job's 'ncpus' value.
  1206. self.license_count_match(8)
  1207. # Check various vnode status.
  1208. jobs_assn1 = "%s/0" % (jid,)
  1209. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1210. 'job-busy', jobs_assn1, 1, '1048576kb')
  1211. self.match_vnode_status([self.n3, self.n6],
  1212. 'job-busy', jobs_assn1, 1, '0kb')
  1213. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1214. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1215. 2, '2097152kb')
  1216. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1217. # This triggers the modifyjob hook
  1218. self.server.alterjob(jid, {ATTR_N: "test"})
  1219. self.server.log_match("modifyjob hook executed", n=100,
  1220. max_attempts=25, interval=2)
  1221. self.server.expect(JOB, {'release_nodes_on_stageout': 'False'}, id=jid)
  1222. # Deleting a job should not trigger automatic
1223. # release of nodes due to release_nodes_on_stageout=False
  1224. self.server.delete(jid)
  1225. # Verify mom_logs
  1226. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1227. max_attempts=5, interval=1,
  1228. existence=False)
  1229. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1230. max_attempts=5, interval=1,
  1231. existence=False)
  1232. # Verify no change in remaining job resources.
  1233. self.server.expect(JOB, {'job_state': 'E',
  1234. 'Resource_List.mem': '6gb',
  1235. 'Resource_List.ncpus': 8,
  1236. 'Resource_List.nodect': 3,
  1237. 'Resource_List.select': self.job1_select,
  1238. 'Resource_List.place': self.job1_place,
  1239. 'schedselect': self.job1_schedselect,
  1240. 'exec_host': self.job1_exec_host,
  1241. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1242. # server's license_count used value matches job's 'ncpus' value.
  1243. self.license_count_match(8)
1244. # Check various vnode status.
  1246. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1247. 'job-busy', jobs_assn1, 1, '1048576kb')
  1248. self.match_vnode_status([self.n3, self.n6], 'job-busy',
  1249. jobs_assn1, 1, '0kb')
  1250. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1251. 2, '2097152kb')
  1252. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1253. # Check for no existence of account update ('u') record
  1254. self.server.accounting_match(
  1255. msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
  1256. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1257. # Check for no existence of account next ('c') record
  1258. self.server.accounting_match(
  1259. msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
  1260. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1261. def test_release_nodes_error(self):
  1262. """
  1263. Tests erroneous cases:
  1264. - pbs_release_nodes (no options given)
  1265. - pbs_release_nodes -j <job-id> (and nothing else)
  1266. - pbs_release_nodes -a (not run inside a job)
  1267. - pbs_release_nodes -j <job-id> -a <node1>
  1268. (both -a and listed nodes are given)
  1269. - pbs_release_nodes -j <unknown-job-id> -a
1270. - pbs_release_nodes -j <job-id> -a run by an unauthorized user
1271. - pbs_release_nodes -j <job-id> -a when the job is not in a
1272. running state. Each case returns the appropriate error message.
  1273. """
  1274. # Test no option given
  1275. cmd = [self.pbs_release_nodes_cmd]
  1276. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1277. runas=TEST_USER)
  1278. self.assertNotEqual(ret['rc'], 0)
  1279. self.assertTrue(ret['err'][0].startswith('usage:'))
  1280. # test only -j <jobid> given
  1281. cmd = [self.pbs_release_nodes_cmd, '-j', '23']
  1282. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1283. runas=TEST_USER)
  1284. self.assertNotEqual(ret['rc'], 0)
  1285. self.assertTrue(ret['err'][0].startswith('usage:'))
  1286. # test only -a given
  1287. cmd = [self.pbs_release_nodes_cmd, '-a']
  1288. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1289. runas=TEST_USER)
  1290. self.assertNotEqual(ret['rc'], 0)
  1291. self.assertTrue(ret['err'][0].startswith(
  1292. 'pbs_release_nodes: No jobid given'))
  1293. # Test specifying an unknown job id
  1294. cmd = [self.pbs_release_nodes_cmd, '-j', '300000', '-a']
  1295. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1296. runas=TEST_USER)
  1297. self.assertNotEqual(ret['rc'], 0)
  1298. self.assertTrue(ret['err'][0].startswith(
  1299. 'pbs_release_nodes: Unknown Job Id 300000'))
  1300. # Test having '-a' and vnode parameter given to pbs_release_nodes
  1301. a = {'Resource_List.select': '3:ncpus=1',
  1302. 'Resource_List.place': 'scatter'}
  1303. jid = self.create_and_submit_job('job', a)
  1304. cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a', self.n4]
  1305. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1306. runas=TEST_USER)
  1307. self.assertNotEqual(ret['rc'], 0)
  1308. self.assertTrue(ret['err'][0].startswith('usage:'))
  1309. self.server.delete(jid)
  1310. # Test pbs_release_nodes' permission
  1311. jid = self.create_and_submit_job('job', a)
  1312. self.server.expect(JOB, {'job_state': 'R'}, id=jid)
  1313. # Run pbs_release_nodes as the executing user != TEST_USER
  1314. cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a']
  1315. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1316. runas=TEST_USER1)
  1317. self.assertNotEqual(ret['rc'], 0)
  1318. self.assertTrue(ret['err'][0].startswith(
  1319. 'pbs_release_nodes: Unauthorized Request'))
  1320. self.server.delete(jid)
  1321. # Test pbs_release_nodes on a non-running job
  1322. a = {'Resource_List.select': '3:ncpus=1',
  1323. ATTR_h: None,
  1324. 'Resource_List.place': 'scatter'}
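# ATTR_h with a None value submits the job with a hold, so it sits in the
# 'H' state and never starts running.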
  1325. jid = self.create_and_submit_job('job', a)
  1326. self.server.expect(JOB, {'job_state': 'H'}, id=jid)
  1327. # Run pbs_release_nodes
  1328. cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a']
  1329. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1330. runas=TEST_USER)
  1331. self.assertNotEqual(ret['rc'], 0)
  1332. self.assertTrue(ret['err'][0].startswith(
  1333. 'pbs_release_nodes: Request invalid for state of job'))
  1334. def test_release_ms_nodes(self):
  1335. """
  1336. Test:
  1337. Given: a job that has been submitted with a select spec
  1338. of 2 super-chunks of ncpus=3 and mem=2gb each,
  1339. and 1 chunk of ncpus=2 and mem=2gb, along with
  1340. place spec of "scatter", resulting in an
  1341. exec_vnode=
  1342. (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  1343. Executing pbs_release_nodes -j <job-id> <n5> <n6> <n1> <n7> where
1344. <n1> is a mother superior vnode, results in the
1345. entire request being rejected.
  1346. """
  1347. jid = self.create_and_submit_job('job1')
  1348. self.server.expect(JOB, {'job_state': 'R',
  1349. 'Resource_List.mem': '6gb',
  1350. 'Resource_List.ncpus': 8,
  1351. 'Resource_List.nodect': 3,
  1352. 'Resource_List.select': self.job1_select,
  1353. 'Resource_List.place': self.job1_place,
  1354. 'schedselect': self.job1_schedselect,
  1355. 'exec_host': self.job1_exec_host,
  1356. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1357. # server's license_count used value matches job's 'ncpus' value.
  1358. self.license_count_match(8)
  1359. # Check various vnode status.
  1360. jobs_assn1 = "%s/0" % (jid,)
  1361. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1362. 'job-busy', jobs_assn1, 1, '1048576kb')
  1363. self.match_vnode_status([self.n3, self.n6],
  1364. 'job-busy', jobs_assn1, 1, '0kb')
  1365. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1366. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1367. 2, '2097152kb')
  1368. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1369. # Run pbs_release_nodes
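# n1 sits on the primary execution (mother superior) host, so including
# it in the list should cause the whole release request to be rejected.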
  1370. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5, self.n6,
  1371. self.n1, self.n7]
  1372. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1373. runas=TEST_USER)
  1374. self.assertNotEqual(ret['rc'], 0)
  1375. self.assertTrue(ret['err'][0].startswith(
  1376. "pbs_release_nodes: " +
  1377. "Can't free '%s' since " % (self.n1,) +
  1378. "it's on a primary execution host"))
  1379. self.server.expect(JOB, {'job_state': 'R',
  1380. 'Resource_List.mem': '6gb',
  1381. 'Resource_List.ncpus': 8,
  1382. 'Resource_List.nodect': 3,
  1383. 'Resource_List.select': self.job1_select,
  1384. 'Resource_List.place': self.job1_place,
  1385. 'schedselect': self.job1_schedselect,
  1386. 'exec_host': self.job1_exec_host,
  1387. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1388. # server's license_count used value matches job's 'ncpus' value.
  1389. self.license_count_match(8)
  1390. # Check various vnode status.
  1391. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1392. 'job-busy', jobs_assn1, 1, '1048576kb')
  1393. self.match_vnode_status([self.n3, self.n6], 'job-busy',
  1394. jobs_assn1, 1, '0kb')
  1395. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1396. 2, '2097152kb')
  1397. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1398. # Check for no existence of account update ('u') record
  1399. self.server.accounting_match(
  1400. msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
  1401. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1402. # Check for no existence of account next ('c') record
  1403. self.server.accounting_match(
  1404. msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
  1405. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1406. def test_release_not_assigned_nodes(self):
  1407. """
  1408. Test:
  1409. Given: a job that has been submitted with a select spec
  1410. of 2 super-chunks of ncpus=3 and mem=2gb each,
  1411. and 1 chunk of ncpus=2 and mem=2gb, along with
  1412. place spec of "scatter", resulting in an
  1413. exec_vnode=
  1414. (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  1415. Executing:
  1416. pbs_release_nodes -j <job-id> <n4> <n5> <no_node> <n6> <n7>
1417. where <no_node> is a node that is not assigned to the job,
1418. causing the entire request to be rejected.
  1419. Result:
  1420. Returns an error message and no nodes get released.
  1421. """
  1422. jid = self.create_and_submit_job('job1')
  1423. self.server.expect(JOB, {'job_state': 'R',
  1424. 'Resource_List.mem': '6gb',
  1425. 'Resource_List.ncpus': 8,
  1426. 'Resource_List.nodect': 3,
  1427. 'Resource_List.select': self.job1_select,
  1428. 'Resource_List.place': self.job1_place,
  1429. 'schedselect': self.job1_schedselect,
  1430. 'exec_host': self.job1_exec_host,
  1431. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1432. # server's license_count used value matches job's 'ncpus' value.
  1433. self.license_count_match(8)
  1434. # Check various vnode status.
  1435. jobs_assn1 = "%s/0" % (jid,)
  1436. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1437. 'job-busy', jobs_assn1, 1, '1048576kb')
  1438. jobs_assn1 = "%s/0" % (jid,)
  1439. self.match_vnode_status([self.n3, self.n6],
  1440. 'job-busy', jobs_assn1, 1, '0kb')
  1441. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1442. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1443. 2, '2097152kb')
  1444. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1445. self.assertTrue(
  1446. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  1447. # Run pbs_release_nodes
  1448. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
  1449. self.n8, self.n6, self.n7]
  1450. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1451. runas=TEST_USER)
  1452. self.assertNotEqual(ret['rc'], 0)
  1453. self.assertTrue(ret['err'][0].startswith(
  1454. "pbs_release_nodes: node(s) requested " +
  1455. "to be released not " +
  1456. "part of the job: %s" % (self.n8,)))
  1457. # Ensure nothing has changed with the job.
  1458. self.server.expect(JOB, {'job_state': 'R',
  1459. 'Resource_List.mem': '6gb',
  1460. 'Resource_List.ncpus': 8,
  1461. 'Resource_List.nodect': 3,
  1462. 'Resource_List.select': self.job1_select,
  1463. 'Resource_List.place': self.job1_place,
  1464. 'schedselect': self.job1_schedselect,
  1465. 'exec_host': self.job1_exec_host,
  1466. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1467. # server's license_count used value matches job's 'ncpus' value.
  1468. self.license_count_match(8)
  1469. # Check various vnode status.
  1470. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1471. 'job-busy', jobs_assn1, 1, '1048576kb')
  1472. self.match_vnode_status([self.n3, self.n6], 'job-busy',
  1473. jobs_assn1, 1, '0kb')
  1474. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1475. 2, '2097152kb')
  1476. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1477. # Check for no existence of account update ('u') record
  1478. self.server.accounting_match(
  1479. msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
  1480. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1481. # Check for no existence of account next ('c') record
  1482. self.server.accounting_match(
  1483. msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
  1484. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1485. def test_release_cray_nodes(self):
  1486. """
  1487. Test:
  1488. Given: a job that has been submitted with a select spec
  1489. of 2 super-chunks of ncpus=3 and mem=2gb each,
  1490. and 1 chunk of ncpus=2 and mem=2gb, along with
  1491. place spec of "scatter", resulting in an
  1492. exec_vnode=
  1493. (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  1494. Executing:
  1495. pbs_release_nodes -j <job-id> <n4> <n5> <n6> <n7>
  1496. where <n7> is a Cray node,
  1497. Result:
  1498. Returns an error message and no nodes get released.
  1499. """
  1500. jid = self.create_and_submit_job('job1')
  1501. self.server.expect(JOB, {'job_state': 'R',
  1502. 'Resource_List.mem': '6gb',
  1503. 'Resource_List.ncpus': 8,
  1504. 'Resource_List.nodect': 3,
  1505. 'Resource_List.select': self.job1_select,
  1506. 'Resource_List.place': self.job1_place,
  1507. 'schedselect': self.job1_schedselect,
  1508. 'exec_host': self.job1_exec_host,
  1509. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1510. # server's license_count used value matches job's 'ncpus' value.
  1511. self.license_count_match(8)
  1512. # Check various vnode status.
  1513. jobs_assn1 = "%s/0" % (jid,)
  1514. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1515. 'job-busy', jobs_assn1, 1, '1048576kb')
  1516. jobs_assn1 = "%s/0" % (jid,)
  1517. self.match_vnode_status([self.n3, self.n6],
  1518. 'job-busy', jobs_assn1, 1, '0kb')
  1519. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1520. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1521. 2, '2097152kb')
  1522. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1523. self.assertTrue(
  1524. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  1525. # Set hostC node to be of cray type
  1526. a = {'resources_available.vntype': 'cray_login'}
  1527. # set natural vnode of hostC
  1528. self.server.manager(MGR_CMD_SET, NODE, a, id=self.n7,
  1529. expect=True)
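# With vntype set to a cray_* value on n7, pbs_release_nodes is expected
# to refuse the request (checked below); node ramp-down is not supported
# on Cray X* series nodes.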
  1530. # Run pbs_release_nodes
  1531. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
  1532. self.n6, self.n7]
  1533. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1534. runas=TEST_USER)
  1535. self.assertNotEqual(ret['rc'], 0)
  1536. self.assertTrue(ret['err'][0].startswith(
  1537. "pbs_release_nodes: not currently supported " +
  1538. "on Cray X* series nodes: "
  1539. "%s" % (self.n7,)))
  1540. # Ensure nothing has changed with the job.
  1541. self.server.expect(JOB, {'job_state': 'R',
  1542. 'Resource_List.mem': '6gb',
  1543. 'Resource_List.ncpus': 8,
  1544. 'Resource_List.nodect': 3,
  1545. 'Resource_List.select': self.job1_select,
  1546. 'Resource_List.place': self.job1_place,
  1547. 'schedselect': self.job1_schedselect,
  1548. 'exec_host': self.job1_exec_host,
  1549. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1550. # server's license_count used value matches job's 'ncpus' value.
  1551. self.license_count_match(8)
  1552. # Check various vnode status.
  1553. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1554. 'job-busy', jobs_assn1, 1, '1048576kb')
  1555. self.match_vnode_status([self.n3, self.n6], 'job-busy',
  1556. jobs_assn1, 1, '0kb')
  1557. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1558. 2, '2097152kb')
  1559. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1560. # Check for no existence of account update ('u') record
  1561. self.server.accounting_match(
  1562. msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
  1563. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1564. # Check for no existence of account next ('c') record
  1565. self.server.accounting_match(
  1566. msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
  1567. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1568. def test_release_cpuset_nodes(self):
  1569. """
  1570. Test:
  1571. Given: a job that has been submitted with a select spec
  1572. of 2 super-chunks of ncpus=3 and mem=2gb each,
  1573. and 1 chunk of ncpus=2 and mem=2gb, along with
  1574. place spec of "scatter", resulting in an
  1575. exec_vnode=
  1576. (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  1577. Executing:
  1578. pbs_release_nodes -j <job-id> <n4> <n5> <n6> <n7>
  1579. where <n7> is a cpuset node,
  1580. Result:
  1581. Returns an error message and no nodes get released.
  1582. """
  1583. jid = self.create_and_submit_job('job1')
  1584. self.server.expect(JOB, {'job_state': 'R',
  1585. 'Resource_List.mem': '6gb',
  1586. 'Resource_List.ncpus': 8,
  1587. 'Resource_List.nodect': 3,
  1588. 'Resource_List.select': self.job1_select,
  1589. 'Resource_List.place': self.job1_place,
  1590. 'schedselect': self.job1_schedselect,
  1591. 'exec_host': self.job1_exec_host,
  1592. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1593. # server's license_count used value matches job's 'ncpus' value.
  1594. self.license_count_match(8)
  1595. # Check various vnode status.
  1596. jobs_assn1 = "%s/0" % (jid,)
  1597. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1598. 'job-busy', jobs_assn1, 1, '1048576kb')
  1599. jobs_assn1 = "%s/0" % (jid,)
  1600. self.match_vnode_status([self.n3, self.n6],
  1601. 'job-busy', jobs_assn1, 1, '0kb')
  1602. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1603. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1604. 2, '2097152kb')
  1605. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1606. self.assertTrue(
  1607. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
1609. # Set hostC to be of cpuset type
  1609. a = {'resources_available.arch': 'linux_cpuset'}
  1610. # set natural vnode of hostC
  1611. self.server.manager(MGR_CMD_SET, NODE, a, id=self.n7,
  1612. expect=True)
  1613. # Run pbs_release_nodes
  1614. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
  1615. self.n6, self.n7]
  1616. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1617. runas=TEST_USER)
  1618. self.assertNotEqual(ret['rc'], 0)
  1619. self.assertTrue(ret['err'][0].startswith(
  1620. "pbs_release_nodes: not currently supported on nodes whose " +
  1621. "resources are part of a cpuset: %s" % (self.n7,)))
  1622. # Ensure nothing has changed with the job.
  1623. self.server.expect(JOB, {'job_state': 'R',
  1624. 'Resource_List.mem': '6gb',
  1625. 'Resource_List.ncpus': 8,
  1626. 'Resource_List.nodect': 3,
  1627. 'Resource_List.select': self.job1_select,
  1628. 'Resource_List.place': self.job1_place,
  1629. 'schedselect': self.job1_schedselect,
  1630. 'exec_host': self.job1_exec_host,
  1631. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1632. # server's license_count used value matches job's 'ncpus' value.
  1633. self.license_count_match(8)
  1634. # Check various vnode status.
  1635. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1636. 'job-busy', jobs_assn1, 1, '1048576kb')
  1637. self.match_vnode_status([self.n3, self.n6], 'job-busy',
  1638. jobs_assn1, 1, '0kb')
  1639. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1640. 2, '2097152kb')
  1641. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1642. # Check for no existence of account update ('u') record
  1643. self.server.accounting_match(
  1644. msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
  1645. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1646. # Check for no existence of account next ('c') record
  1647. self.server.accounting_match(
  1648. msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
  1649. regexp=True, n=20, existence=False, max_attempts=5, interval=1)
  1650. def test_release_nodes_all(self):
  1651. """
  1652. Test:
  1653. Given a job that specifies a select spec of
  1654. 2 super-chunks of ncpus=3 and mem=2gb each,
  1655. and 1 chunk of ncpus=2 and mem=2gb, along with
  1656. place spec of "scatter".
  1657. Calling
  1658. pbs_release_nodes -j <job-id> -a
  1659. will result in all the sister nodes getting
  1660. unassigned from the job.
  1661. """
  1662. jid = self.create_and_submit_job('job1_2')
  1663. self.server.expect(JOB, {'job_state': 'R',
  1664. 'Resource_List.mem': '6gb',
  1665. 'Resource_List.ncpus': 8,
  1666. 'Resource_List.nodect': 3,
  1667. 'Resource_List.select': self.job1_select,
  1668. 'Resource_List.place': self.job1_place,
  1669. 'schedselect': self.job1_schedselect,
  1670. 'exec_host': self.job1_exec_host,
  1671. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1672. # server's license_count used value matches job's 'ncpus' value.
  1673. self.license_count_match(8)
  1674. # Check various vnode status.
  1675. jobs_assn1 = "%s/0" % (jid,)
  1676. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1677. 'job-busy', jobs_assn1, 1, '1048576kb')
  1678. self.match_vnode_status([self.n3, self.n6],
  1679. 'job-busy', jobs_assn1, 1, '0kb')
  1680. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1681. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1682. 2, '2097152kb')
  1683. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1684. self.assertTrue(
  1685. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  1686. # Run pbs_release_nodes as regular user
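# '-a' releases every vnode not on the primary execution host; only the
# mother superior's vnodes (n1, n2, n3) should remain assigned to the job.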
  1687. cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a']
  1688. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1689. runas=TEST_USER)
  1690. self.assertEqual(ret['rc'], 0)
  1691. # Verify mom_logs
  1692. self.momA.log_match(
  1693. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
  1694. max_attempts=18, interval=2, regexp=True)
  1695. self.momA.log_match(
  1696. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
  1697. max_attempts=18, interval=2, regexp=True)
  1698. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1699. max_attempts=18, interval=2)
  1700. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1701. max_attempts=18, interval=2)
  1702. # Verify remaining job resources.
  1703. self.server.expect(JOB, {'job_state': 'R',
  1704. 'Resource_List.mem': '2gb',
  1705. 'Resource_List.ncpus': 3,
  1706. 'Resource_List.select': self.job1_newsel,
  1707. 'Resource_List.place': self.job1_place,
  1708. 'Resource_List.nodect': 1,
  1709. 'schedselect': self.job1_newsel,
  1710. 'exec_host': self.job1_new_exec_host,
  1711. 'exec_vnode': self.job1_new_exec_vnode},
  1712. id=jid)
  1713. # server's license_count used value matches job's 'ncpus' value.
  1714. self.license_count_match(3)
  1715. # Check various vnode status.
  1716. self.match_vnode_status([self.n1, self.n2],
  1717. 'job-busy', jobs_assn1, 1, '1048576kb')
  1718. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
  1719. self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
  1720. self.n7, self.n8, self.n9, self.n10], 'free')
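# Server- and queue-level resources_assigned should now reflect only the
# retained first chunk: 3 ncpus and 2gb (2097152kb) of memory.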
  1721. self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
  1722. 'resources_assigned.mem': '2097152kb'})
  1723. self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
  1724. 'resources_assigned.mem': '2097152kb'},
  1725. id="workq")
  1726. self.assertTrue(
  1727. self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
  1728. # Check account update ('u') record
  1729. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  1730. self.job1_exec_vnode_esc, "6gb", 8, 3,
  1731. self.job1_place,
  1732. self.job1_sel_esc)
  1733. # Check to make sure 'c' (next) record got generated
  1734. self.match_accounting_log('c', jid, self.job1_new_exec_host,
  1735. self.job1_new_exec_vnode_esc, "2097152kb",
  1736. 3, 1, self.job1_place, self.job1_newsel)
  1737. def test_release_nodes_all_as_root(self):
  1738. """
  1739. Test:
  1740. Same test as test_release_nodes_all except the pbs_release_nodes
  1741. call is executed by root. Result is the same.
  1742. """
  1743. jid = self.create_and_submit_job('job1_2')
  1744. self.server.expect(JOB, {'job_state': 'R',
  1745. 'Resource_List.mem': '6gb',
  1746. 'Resource_List.ncpus': 8,
  1747. 'Resource_List.nodect': 3,
  1748. 'Resource_List.select': self.job1_select,
  1749. 'Resource_List.place': self.job1_place,
  1750. 'schedselect': self.job1_schedselect,
  1751. 'exec_host': self.job1_exec_host,
  1752. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1753. # server's license_count used value matches job's 'ncpus' value.
  1754. self.license_count_match(8)
  1755. # Check various vnode status.
  1756. jobs_assn1 = "%s/0" % (jid,)
  1757. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1758. 'job-busy', jobs_assn1, 1, '1048576kb')
  1759. self.match_vnode_status([self.n3, self.n6],
  1760. 'job-busy', jobs_assn1, 1, '0kb')
  1761. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1762. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1763. 2, '2097152kb')
  1764. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1765. self.assertTrue(
  1766. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  1767. # Run pbs_release_nodes as root
  1768. cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a']
  1769. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1770. sudo=True)
  1771. self.assertEqual(ret['rc'], 0)
  1772. # Verify mom_logs
  1773. self.momA.log_match(
  1774. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
  1775. max_attempts=18, interval=2, regexp=True)
  1776. self.momA.log_match(
  1777. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
  1778. max_attempts=18, interval=2, regexp=True)
  1779. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1780. max_attempts=18, interval=2)
  1781. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1782. max_attempts=18, interval=2)
  1783. # Verify remaining job resources.
  1784. self.server.expect(JOB, {'job_state': 'R',
  1785. 'Resource_List.mem': '2gb',
  1786. 'Resource_List.ncpus': 3,
  1787. 'Resource_List.select': self.job1_newsel,
  1788. 'Resource_List.place': self.job1_place,
  1789. 'Resource_List.nodect': 1,
  1790. 'schedselect': self.job1_newsel,
  1791. 'exec_host': self.job1_new_exec_host,
  1792. 'exec_vnode': self.job1_new_exec_vnode},
  1793. id=jid)
  1794. # server's license_count used value matches job's 'ncpus' value.
  1795. self.license_count_match(3)
  1796. # Check various vnode status.
  1797. self.match_vnode_status([self.n1, self.n2],
  1798. 'job-busy', jobs_assn1, 1, '1048576kb')
  1799. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
  1800. self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
  1801. self.n7, self.n8, self.n9, self.n10], 'free')
  1802. self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
  1803. 'resources_assigned.mem': '2097152kb'})
  1804. self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
  1805. 'resources_assigned.mem': '2097152kb'},
  1806. id="workq")
  1807. self.assertTrue(
  1808. self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
  1809. # Check account update ('u') record
  1810. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  1811. self.job1_exec_vnode_esc, "6gb", 8, 3,
  1812. self.job1_place,
  1813. self.job1_sel_esc)
  1814. # Check to make sure 'c' (next) record got generated
  1815. self.match_accounting_log('c', jid, self.job1_new_exec_host,
  1816. self.job1_new_exec_vnode_esc, "2097152kb",
  1817. 3, 1, self.job1_place, self.job1_newsel)
  1818. def test_release_nodes_all_inside_job(self):
  1819. """
  1820. Test:
  1821. Like test_release_all test except instead of calling
  1822. pbs_release_nodes from a command line, it is executed
  1823. inside the job script of a running job. Same results.
  1824. """
  1825. # This one has a job script that calls 'pbs_release_nodes'
  1826. # (no jobid specified)
  1827. jid = self.create_and_submit_job('job1_3')
  1828. self.server.expect(JOB, {'job_state': 'R',
  1829. 'Resource_List.mem': '6gb',
  1830. 'Resource_List.ncpus': 8,
  1831. 'Resource_List.nodect': 3,
  1832. 'Resource_List.select': self.job1_select,
  1833. 'Resource_List.place': self.job1_place,
  1834. 'schedselect': self.job1_schedselect,
  1835. 'exec_host': self.job1_exec_host,
  1836. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1837. # server's license_count used value matches job's 'ncpus' value.
  1838. self.license_count_match(8)
  1839. # Check various vnode status.
  1840. jobs_assn1 = "%s/0" % (jid,)
  1841. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1842. 'job-busy', jobs_assn1, 1, '1048576kb')
  1843. self.match_vnode_status([self.n3, self.n6],
  1844. 'job-busy', jobs_assn1, 1, '0kb')
  1845. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1846. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1847. 2, '2097152kb')
  1848. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1849. self.assertTrue(
  1850. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  1851. # wait for the job to execute pbs_release_nodes
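# job1_3's script is assumed to invoke 'pbs_release_nodes -a' on its own
# shortly after it starts, hence the fixed sleep.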
  1852. time.sleep(10)
  1853. # Verify mom_logs
  1854. self.momA.log_match(
  1855. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
  1856. max_attempts=18, interval=2, regexp=True)
  1857. self.momA.log_match(
  1858. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
  1859. max_attempts=18, interval=2, regexp=True)
  1860. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1861. max_attempts=18, interval=2)
  1862. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1863. max_attempts=18, interval=2)
  1864. # Verify remaining job resources.
  1865. self.server.expect(JOB, {'job_state': 'R',
  1866. 'Resource_List.mem': '2gb',
  1867. 'Resource_List.ncpus': 3,
  1868. 'Resource_List.select': self.job1_newsel,
  1869. 'Resource_List.place': self.job1_place,
  1870. 'Resource_List.nodect': 1,
  1871. 'schedselect': self.job1_newsel,
  1872. 'exec_host': self.job1_new_exec_host,
  1873. 'exec_vnode': self.job1_new_exec_vnode},
  1874. id=jid)
  1875. # server's license_count used value matches job's 'ncpus' value.
  1876. self.license_count_match(3)
  1877. # Check various vnode status.
  1878. self.match_vnode_status([self.n1, self.n2],
  1879. 'job-busy', jobs_assn1, 1, '1048576kb')
  1880. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
  1881. self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
  1882. self.n7, self.n8, self.n9, self.n10], 'free')
  1883. self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
  1884. 'resources_assigned.mem': '2097152kb'})
  1885. self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
  1886. 'resources_assigned.mem': '2097152kb'},
  1887. id="workq")
  1888. self.assertTrue(
  1889. self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
  1890. # Check account update ('u') record
  1891. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  1892. self.job1_exec_vnode_esc, "6gb", 8, 3,
  1893. self.job1_place,
  1894. self.job1_sel_esc)
  1895. # Check to make sure 'c' (next) record got generated
  1896. self.match_accounting_log('c', jid, self.job1_new_exec_host,
  1897. self.job1_new_exec_vnode_esc, "2097152kb",
  1898. 3, 1, self.job1_place, self.job1_newsel)
    def test_release_nodes1(self):
        """
        Test:
             Given: a job that has been submitted with a select spec
             of 2 super-chunks of ncpus=3 and mem=2gb each,
             and 1 chunk of ncpus=2 and mem=2gb, along with
             place spec of "scatter", resulting in an
             exec_vnode=
                  (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
             Executing pbs_release_nodes -j <job-id> <n4>
             results in:
             1. node <n4> no longer appearing in the job's
                exec_vnode value,
             2. resources associated with the
                node being taken out of the job's Resources_List.* and
                schedselect values,
             3. Since node <n4> is just one of the vnodes in the
                host assigned to the second super-chunk, the node
                still won't accept new jobs until all the other
                allocated vnodes from the same mom host are released.
                The resources assigned to the job from
                node <n4> continue to be assigned, including the
                corresponding licenses.
             NOTE: This is testing to make sure the position of <n4>
             in the exec_vnode string (left end of a super-chunk) will
             not break the recreation of the attribute value after
             release.
        """
  1927. jid = self.create_and_submit_job('job1_5')
  1928. self.server.expect(JOB, {'job_state': 'R',
  1929. 'Resource_List.mem': '6gb',
  1930. 'Resource_List.ncpus': 8,
  1931. 'Resource_List.nodect': 3,
  1932. 'Resource_List.select': self.job1_select,
  1933. 'Resource_List.place': self.job1_place,
  1934. 'schedselect': self.job1_schedselect,
  1935. 'exec_host': self.job1_exec_host,
  1936. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  1937. # server's license_count used value matches job's 'ncpus' value.
  1938. self.license_count_match(8)
  1939. # Check various vnode status.
  1940. jobs_assn1 = "%s/0" % (jid,)
  1941. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  1942. 'job-busy', jobs_assn1, 1, '1048576kb')
  1943. self.match_vnode_status([self.n3, self.n6],
  1944. 'job-busy', jobs_assn1, 1, '0kb')
  1945. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  1946. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  1947. 2, '2097152kb')
  1948. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  1949. self.assertTrue(
  1950. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  1951. # Run pbs_release_nodes as root
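        # (equivalent command line: pbs_release_nodes -j <jobid> <n4>,
        # executed on the server host via sudo)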
  1952. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
  1953. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  1954. sudo=True)
  1955. self.assertEqual(ret['rc'], 0)
  1956. # Verify mom_logs
  1957. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  1958. jid, self.hostB), n=10,
  1959. regexp=True,
  1960. existence=False, max_attempts=5, interval=1)
  1961. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  1962. jid, self.hostC), n=10,
  1963. regexp=True,
  1964. existence=False, max_attempts=5, interval=1)
  1965. # momB's host will not get DELETE_JOB2 request since
  1966. # not all its vnodes have been released yet from the job.
  1967. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  1968. existence=False, max_attempts=5, interval=1)
  1969. # Verify remaining job resources.
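        # Expected select/exec_host/exec_vnode values are also kept in
        # regex-escaped form (backslash-escaping '+', '*', '[', ']', '(',
        # ')') because they are later matched against accounting/log
        # records as regular expressions.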
  1970. newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
  1971. "1:ncpus=2:mem=2097152kb"
  1972. newsel_esc = newsel.replace("+", "\+")
  1973. new_exec_host = self.job1_exec_host
  1974. new_exec_host_esc = self.job1_exec_host.replace(
  1975. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  1976. new_exec_vnode = self.job1_exec_vnode.replace(
  1977. "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
  1978. new_exec_vnode_esc = new_exec_vnode.replace(
  1979. "[", "\[").replace("]", "\]").replace(
  1980. "(", "\(").replace(")", "\)").replace("+", "\+")
  1981. self.server.expect(JOB, {'job_state': 'R',
  1982. 'Resource_List.mem': '5gb',
  1983. 'Resource_List.ncpus': 7,
  1984. 'Resource_List.select': newsel,
  1985. 'Resource_List.place': self.job1_place,
  1986. 'Resource_List.nodect': 3,
  1987. 'schedselect': newsel,
  1988. 'exec_host': self.job1_exec_host,
  1989. 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is now listed with ncpus=7, taking away released
        # vnode <n4>, that vnode comes from a super-chunk whose other vnodes
        # <n5> and <n6> are still assigned to the job. So the parent mom of
        # <n4> still won't release the job and thus, the 1 license for it is
        # still allocated.
  1995. self.license_count_match(8)
  1996. # Check account update ('u') record
  1997. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  1998. self.job1_exec_vnode_esc, "6gb", 8, 3,
  1999. self.job1_place,
  2000. self.job1_sel_esc)
  2001. # Check to make sure 'c' (next) record got generated
  2002. self.match_accounting_log('c', jid, self.job1_exec_host_esc,
  2003. new_exec_vnode_esc, "5242880kb",
  2004. 7, 3, self.job1_place, newsel_esc)
  2005. # Check various vnode status.
  2006. jobs_assn1 = "%s/0" % (jid,)
  2007. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2008. 'job-busy', jobs_assn1, 1, '1048576kb')
  2009. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  2010. '0kb')
  2011. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2012. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2013. 2, '2097152kb')
  2014. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2015. self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
  2016. 'resources_assigned.mem': '6291456kb'})
  2017. self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
  2018. 'resources_assigned.mem': '6291456kb'},
  2019. id="workq")
  2020. self.assertTrue(
  2021. self.pbs_nodefile_match_exec_host(jid, new_exec_host))
  2022. self.server.delete(jid)
  2023. # Check account phased end ('e') record
  2024. self.match_accounting_log('e', jid, new_exec_host_esc,
  2025. new_exec_vnode_esc,
  2026. "5242880kb", 7, 3,
  2027. self.job1_place,
  2028. newsel_esc)
  2029. # Check to make sure 'E' (end of job) record got generated
  2030. self.match_accounting_log('E', jid, self.job1_exec_host_esc,
  2031. self.job1_exec_vnode_esc, "6gb",
  2032. 8, 3, self.job1_place, self.job1_sel_esc)
    def test_release_nodes1_as_user(self):
        """
        Test:
             Same as test_release_nodes1 except pbs_release_nodes
             is executed by a regular user. Same results.
        """
  2039. jid = self.create_and_submit_job('job1_5')
  2040. self.server.expect(JOB, {'job_state': 'R',
  2041. 'Resource_List.mem': '6gb',
  2042. 'Resource_List.ncpus': 8,
  2043. 'Resource_List.nodect': 3,
  2044. 'Resource_List.select': self.job1_select,
  2045. 'Resource_List.place': self.job1_place,
  2046. 'schedselect': self.job1_schedselect,
  2047. 'exec_host': self.job1_exec_host,
  2048. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  2049. # server's license_count used value matches job's 'ncpus' value.
  2050. self.license_count_match(8)
  2051. # Check various vnode status.
  2052. jobs_assn1 = "%s/0" % (jid,)
  2053. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2054. 'job-busy', jobs_assn1, 1, '1048576kb')
  2055. self.match_vnode_status([self.n3, self.n6],
  2056. 'job-busy', jobs_assn1, 1, '0kb')
  2057. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2058. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2059. 2, '2097152kb')
  2060. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2061. self.assertTrue(
  2062. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  2063. # Run pbs_release_nodes as regular user
  2064. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
  2065. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  2066. runas=TEST_USER)
  2067. self.assertEqual(ret['rc'], 0)
  2068. # Verify mom_logs
  2069. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2070. jid, self.hostB), n=10,
  2071. regexp=True,
  2072. existence=False, max_attempts=5, interval=1)
  2073. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2074. jid, self.hostC), n=10,
  2075. regexp=True,
  2076. existence=False, max_attempts=5, interval=1)
  2077. # momB and momC's hosts will not get DELETE_JOB2 request since
  2078. # not all their vnodes have been released yet from the job.
  2079. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2080. existence=False, max_attempts=5, interval=1)
  2081. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2082. existence=False, max_attempts=5, interval=1)
  2083. # Verify remaining job resources.
  2084. newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
  2085. "1:ncpus=2:mem=2097152kb"
  2086. newsel_esc = newsel.replace("+", "\+")
  2087. new_exec_host = self.job1_exec_host
  2088. new_exec_host_esc = self.job1_exec_host.replace(
  2089. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2090. new_exec_vnode = self.job1_exec_vnode.replace(
  2091. "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
  2092. new_exec_vnode_esc = new_exec_vnode.replace(
  2093. "[", "\[").replace("]", "\]").replace(
  2094. "(", "\(").replace(")", "\)").replace("+", "\+")
  2095. self.server.expect(JOB, {'job_state': 'R',
  2096. 'Resource_List.mem': '5gb',
  2097. 'Resource_List.ncpus': 7,
  2098. 'Resource_List.select': newsel,
  2099. 'Resource_List.place': self.job1_place,
  2100. 'Resource_List.nodect': 3,
  2101. 'schedselect': newsel,
  2102. 'exec_host': self.job1_exec_host,
  2103. 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is now listed with ncpus=7, taking away released
        # vnode <n4>, that vnode comes from a super-chunk whose other vnodes
        # <n5> and <n6> are still assigned to the job. So the parent mom of
        # <n4> still won't release the job and thus, the 1 license for it is
        # still allocated.
  2109. self.license_count_match(8)
  2110. # Check account update ('u') record
  2111. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  2112. self.job1_exec_vnode_esc, "6gb", 8, 3,
  2113. self.job1_place,
  2114. self.job1_sel_esc)
  2115. # Check to make sure 'c' (next) record got generated
  2116. self.match_accounting_log('c', jid, self.job1_exec_host_esc,
  2117. new_exec_vnode_esc, "5242880kb",
  2118. 7, 3, self.job1_place, newsel_esc)
  2119. # Check various vnode status.
  2120. jobs_assn1 = "%s/0" % (jid,)
  2121. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2122. 'job-busy', jobs_assn1, 1, '1048576kb')
  2123. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  2124. '0kb')
  2125. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2126. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2127. 2, '2097152kb')
  2128. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2129. self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
  2130. 'resources_assigned.mem': '6291456kb'})
  2131. self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
  2132. 'resources_assigned.mem': '6291456kb'},
  2133. id="workq")
  2134. self.assertTrue(
  2135. self.pbs_nodefile_match_exec_host(jid, new_exec_host))
  2136. self.server.delete(jid)
  2137. # Check account phased end ('e') record
  2138. self.match_accounting_log('e', jid, new_exec_host_esc,
  2139. new_exec_vnode_esc,
  2140. "5242880kb", 7, 3,
  2141. self.job1_place,
  2142. newsel_esc)
  2143. # Check to make sure 'E' (end of job) record got generated
  2144. self.match_accounting_log('E', jid, self.job1_exec_host_esc,
  2145. self.job1_exec_vnode_esc, "6gb",
  2146. 8, 3, self.job1_place, self.job1_sel_esc)
    def test_release_nodes1_extra(self):
        """
        Test:
             Like test_release_nodes1 except instead of the super-chunk
             and chunks getting only ncpus and mem values, additional
             resources mpiprocs and ompthreads are also requested and
             assigned. For example:

             qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
                             ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
                             ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"

             We want to make sure the ompthreads and mpiprocs values are
             preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
             the host names are duplicated according to the number of
             mpiprocs. For example, if <n1> is assigned to the first
             chunk, with mpiprocs=3, <n1> will appear 3 times in
             $PBS_NODEFILE.
        """
  2165. jid = self.create_and_submit_job('job1_extra_res')
  2166. self.server.expect(JOB, {'job_state': 'R',
  2167. 'Resource_List.mem': '6gb',
  2168. 'Resource_List.ncpus': 8,
  2169. 'Resource_List.nodect': 3,
  2170. 'Resource_List.select':
  2171. self.job1_extra_res_select,
  2172. 'Resource_List.place':
  2173. self.job1_extra_res_place,
  2174. 'schedselect':
  2175. self.job1_extra_res_schedselect,
  2176. 'exec_host':
  2177. self.job1_extra_res_exec_host,
  2178. 'exec_vnode':
  2179. self.job1_extra_res_exec_vnode}, id=jid)
  2180. # server's license_count used value matches job's 'ncpus' value.
  2181. self.license_count_match(8)
  2182. # Check various vnode status.
  2183. jobs_assn1 = "%s/0" % (jid,)
  2184. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2185. 'job-busy', jobs_assn1, 1, '1048576kb')
  2186. self.match_vnode_status([self.n3, self.n6],
  2187. 'job-busy', jobs_assn1, 1, '0kb')
  2188. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2189. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2190. 2, '2097152kb')
  2191. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # The pbs_nodefile_match_exec_host() call below also verifies that
        # the host names appear in $PBS_NODEFILE according to the number of
        # mpiprocs assigned to each chunk.
  2195. self.assertTrue(
  2196. self.pbs_nodefile_match_exec_host(
  2197. jid, self.job1_extra_res_exec_host,
  2198. self.job1_extra_res_schedselect))
  2199. # Run pbs_release_nodes as root
  2200. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
  2201. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  2202. sudo=True)
  2203. self.assertEqual(ret['rc'], 0)
  2204. # Verify mom_logs
  2205. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2206. jid, self.hostB), n=10,
  2207. regexp=True,
  2208. existence=False, max_attempts=5, interval=1)
  2209. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2210. jid, self.hostC), n=10,
  2211. regexp=True,
  2212. existence=False, max_attempts=5, interval=1)
  2213. # momB and momC's hosts will not get DELETE_JOB2 request since
  2214. # not all their vnodes have been released yet from the job.
  2215. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2216. existence=False, max_attempts=5, interval=1)
  2217. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2218. existence=False, max_attempts=5, interval=1)
  2219. # Verify remaining job resources.
  2220. sel_esc = self.job1_extra_res_select.replace("+", "\+")
  2221. exec_host_esc = self.job1_extra_res_exec_host.replace(
  2222. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2223. exec_vnode_esc = \
  2224. self.job1_extra_res_exec_vnode.replace(
  2225. "[", "\[").replace(
  2226. "]", "\]").replace("(", "\(").replace(")", "\)").replace(
  2227. "+", "\+")
  2228. newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
  2229. "1:mem=1048576kb:ncpus=2:mpiprocs=3:ompthreads=3+" + \
  2230. "1:ncpus=2:mem=2097152kb:mpiprocs=2:ompthreads=2"
  2231. newsel_esc = newsel.replace("+", "\+")
  2232. new_exec_host = self.job1_extra_res_exec_host
  2233. new_exec_host_esc = self.job1_extra_res_exec_host.replace(
  2234. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2235. new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
  2236. "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
  2237. new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
  2238. "]", "\]").replace(
  2239. "(", "\(").replace(")", "\)").replace("+", "\+")
  2240. self.server.expect(JOB,
  2241. {'job_state': 'R',
  2242. 'Resource_List.mem': '5gb',
  2243. 'Resource_List.ncpus': 7,
  2244. 'Resource_List.select': newsel,
  2245. 'Resource_List.place': self.job1_extra_res_place,
  2246. 'Resource_List.nodect': 3,
  2247. 'schedselect': newsel,
  2248. 'exec_host': new_exec_host,
  2249. 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is now listed with ncpus=7, taking away released
        # vnode <n4>, that vnode comes from a super-chunk whose other vnodes
        # <n5> and <n6> are still assigned to the job. So the parent mom of
        # <n4> still won't release the job and thus, the 1 license for it is
        # still allocated.
  2255. self.license_count_match(8)
  2256. # Check account update ('u') record
  2257. self.match_accounting_log('u', jid, exec_host_esc,
  2258. exec_vnode_esc, "6gb", 8, 3,
  2259. self.job1_extra_res_place,
  2260. sel_esc)
  2261. # Check to make sure 'c' (next) record got generated
  2262. self.match_accounting_log('c', jid, new_exec_host_esc,
  2263. new_exec_vnode_esc, "5242880kb",
  2264. 7, 3, self.job1_extra_res_place, newsel_esc)
  2265. # Check various vnode status.
  2266. jobs_assn1 = "%s/0" % (jid,)
  2267. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2268. 'job-busy', jobs_assn1, 1, '1048576kb')
  2269. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  2270. '0kb')
  2271. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2272. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2273. 2, '2097152kb')
  2274. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2275. self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
  2276. 'resources_assigned.mem': '6291456kb'})
  2277. self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
  2278. 'resources_assigned.mem': '6291456kb'},
  2279. id="workq")
  2280. self.assertTrue(
  2281. self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
  2282. self.server.delete(jid)
  2283. # Check account phased end ('e') record
  2284. self.match_accounting_log('e', jid, new_exec_host_esc,
  2285. new_exec_vnode_esc,
  2286. "5242880kb", 7, 3,
  2287. self.job1_extra_res_place,
  2288. newsel_esc)
  2289. # Check to make sure 'E' (end of job) record got generated
  2290. self.match_accounting_log('E', jid, exec_host_esc,
  2291. exec_vnode_esc, "6gb",
  2292. 8, 3, self.job1_extra_res_place,
  2293. sel_esc)
    @timeout(400)
    def test_release_nodes2(self):
        """
        Test:
             Given: a job that has been submitted with a select spec
             of 2 super-chunks of ncpus=3 and mem=2gb each,
             and 1 chunk of ncpus=2 and mem=2gb, along with
             place spec of "scatter", resulting in an
             exec_vnode=
                  (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
             Executing pbs_release_nodes -j <job-id> <n5>
             results in:
             1. node <n5> no longer appearing in the job's
                exec_vnode value,
             2. resources associated with the
                node being taken out of the job's Resources_List.* and
                schedselect values,
             3. Since node <n5> is just one of the vnodes in the
                host assigned to the second super-chunk, the node
                still won't accept new jobs until all the other
                allocated vnodes from the same mom host are released.
                The resources assigned to the job from
                node <n5> continue to be assigned, including the
                corresponding licenses.
             NOTE: This is testing to make sure the position of <n5>
             in the exec_vnode string (middle of a super-chunk) will
             not break the recreation of the attribute value after
             release.
        """
  2323. jid = self.create_and_submit_job('job1_5')
  2324. self.server.expect(JOB, {'job_state': 'R',
  2325. 'Resource_List.mem': '6gb',
  2326. 'Resource_List.ncpus': 8,
  2327. 'Resource_List.nodect': 3,
  2328. 'Resource_List.select': self.job1_select,
  2329. 'Resource_List.place': self.job1_place,
  2330. 'schedselect': self.job1_schedselect,
  2331. 'exec_host': self.job1_exec_host,
  2332. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  2333. # server's license_count used value matches job's 'ncpus' value.
  2334. self.license_count_match(8)
  2335. # Check various vnode status.
  2336. jobs_assn1 = "%s/0" % (jid,)
  2337. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2338. 'job-busy', jobs_assn1, 1, '1048576kb')
  2339. self.match_vnode_status([self.n3, self.n6],
  2340. 'job-busy', jobs_assn1, 1, '0kb')
  2341. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2342. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2343. 2, '2097152kb')
  2344. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2345. self.assertTrue(
  2346. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  2347. # Run pbs_release_nodes as root
  2348. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5]
  2349. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  2350. sudo=True)
  2351. self.assertEqual(ret['rc'], 0)
  2352. # Verify mom_logs
  2353. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2354. jid, self.hostB), n=10,
  2355. regexp=True,
  2356. existence=False, max_attempts=5, interval=1)
  2357. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2358. jid, self.hostC), n=10,
  2359. regexp=True,
  2360. existence=False, max_attempts=5, interval=1)
  2361. # momB and momC's hosts will not get DELETE_JOB2 request since
  2362. # not all their vnodes have been released yet from the job.
  2363. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2364. existence=False, max_attempts=5, interval=1)
  2365. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2366. existence=False, max_attempts=5, interval=1)
  2367. # Verify remaining job resources.
  2368. exec_host_esc = self.job1_exec_host.replace(
  2369. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2370. exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
  2371. "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
  2372. newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
  2373. "1:ncpus=2:mem=2097152kb"
  2374. newsel_esc = newsel.replace("+", "\+")
  2375. new_exec_host = self.job1_exec_host
  2376. new_exec_host_esc = self.job1_exec_host.replace(
  2377. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2378. new_exec_vnode = self.job1_exec_vnode.replace(
  2379. "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
  2380. new_exec_vnode_esc = new_exec_vnode.replace(
  2381. "[", "\[").replace("]", "\]").replace(
  2382. "(", "\(").replace(")", "\)").replace("+", "\+")
  2383. self.server.expect(JOB, {'job_state': 'R',
  2384. 'Resource_List.mem': '5gb',
  2385. 'Resource_List.ncpus': 7,
  2386. 'Resource_List.select': newsel,
  2387. 'Resource_List.place': self.job1_place,
  2388. 'Resource_List.nodect': 3,
  2389. 'schedselect': newsel,
  2390. 'exec_host': new_exec_host,
  2391. 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is now listed with ncpus=7, taking away released
        # vnode <n5>, that vnode comes from a super-chunk whose other vnodes
        # <n4> and <n6> are still assigned to the job. So the parent mom of
        # <n5> still won't release the job and thus, the 1 license for it is
        # still allocated.
  2397. self.license_count_match(8)
  2398. # Check account update ('u') record
  2399. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  2400. self.job1_exec_vnode_esc, "6gb", 8, 3,
  2401. self.job1_place,
  2402. self.job1_sel_esc)
  2403. # Check to make sure 'c' (next) record got generated
  2404. self.match_accounting_log('c', jid, self.job1_exec_host_esc,
  2405. new_exec_vnode_esc, "5242880kb",
  2406. 7, 3, self.job1_place, newsel_esc)
  2407. # Check various vnode status.
  2408. jobs_assn1 = "%s/0" % (jid,)
  2409. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2410. 'job-busy', jobs_assn1, 1, '1048576kb')
  2411. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  2412. '0kb')
  2413. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2414. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2415. 2, '2097152kb')
  2416. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2417. self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
  2418. 'resources_assigned.mem': '6291456kb'})
  2419. self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
  2420. 'resources_assigned.mem': '6291456kb'},
  2421. id="workq")
  2422. self.assertTrue(
  2423. self.pbs_nodefile_match_exec_host(jid, new_exec_host))
  2424. self.server.delete(jid)
  2425. # Check account phased end ('e') record
  2426. self.match_accounting_log('e', jid, new_exec_host_esc,
  2427. new_exec_vnode_esc,
  2428. "5242880kb", 7, 3,
  2429. self.job1_place,
  2430. newsel_esc)
  2431. # Check to make sure 'E' (end of job) record got generated
  2432. self.match_accounting_log('E', jid, self.job1_exec_host_esc,
  2433. self.job1_exec_vnode_esc, "6gb",
  2434. 8, 3, self.job1_place, self.job1_sel_esc)
    def test_release_nodes2_extra(self):
        """
        Test:
             Like test_release_nodes2 except instead of the super-chunk
             and chunks getting only ncpus and mem values, additional
             resources mpiprocs and ompthreads are also requested and
             assigned. For example:

             qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
                             ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
                             ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"

             We want to make sure the ompthreads and mpiprocs values are
             preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
             the host names are duplicated according to the number of
             mpiprocs. For example, if <n1> is assigned to the first
             chunk, with mpiprocs=3, <n1> will appear 3 times in
             $PBS_NODEFILE.
        """
  2452. """
  2453. jid = self.create_and_submit_job('job1_extra_res')
  2454. self.server.expect(JOB, {'job_state': 'R',
  2455. 'Resource_List.mem': '6gb',
  2456. 'Resource_List.ncpus': 8,
  2457. 'Resource_List.nodect': 3,
  2458. 'Resource_List.select':
  2459. self.job1_extra_res_select,
  2460. 'Resource_List.place':
  2461. self.job1_extra_res_place,
  2462. 'schedselect':
  2463. self.job1_extra_res_schedselect,
  2464. 'exec_host':
  2465. self.job1_extra_res_exec_host,
  2466. 'exec_vnode':
  2467. self.job1_extra_res_exec_vnode}, id=jid)
  2468. # server's license_count used value matches job's 'ncpus' value.
  2469. self.license_count_match(8)
  2470. # Check various vnode status.
  2471. jobs_assn1 = "%s/0" % (jid,)
  2472. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2473. 'job-busy', jobs_assn1, 1, '1048576kb')
  2474. self.match_vnode_status([self.n3, self.n6],
  2475. 'job-busy', jobs_assn1, 1, '0kb')
  2476. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2477. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2478. 2, '2097152kb')
  2479. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # The pbs_nodefile_match_exec_host() call below also verifies that
        # the host names appear in $PBS_NODEFILE according to the number of
        # mpiprocs assigned to each chunk.
  2483. self.assertTrue(
  2484. self.pbs_nodefile_match_exec_host(
  2485. jid, self.job1_extra_res_exec_host,
  2486. self.job1_extra_res_schedselect))
  2487. # Run pbs_release_nodes as root
  2488. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5]
  2489. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  2490. sudo=True)
  2491. self.assertEqual(ret['rc'], 0)
  2492. # Verify mom_logs
  2493. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2494. jid, self.hostB), n=10,
  2495. regexp=True,
  2496. existence=False, max_attempts=5, interval=1)
  2497. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2498. jid, self.hostC), n=10,
  2499. regexp=True,
  2500. existence=False, max_attempts=5, interval=1)
  2501. # momB and momC's hosts will not get DELETE_JOB2 request since
  2502. # not all their vnodes have been released yet from the job.
  2503. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2504. existence=False, max_attempts=5, interval=1)
  2505. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2506. existence=False, max_attempts=5, interval=1)
  2507. # Verify remaining job resources.
  2508. sel_esc = self.job1_extra_res_select.replace("+", "\+")
  2509. exec_host_esc = self.job1_extra_res_exec_host.replace(
  2510. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2511. exec_vnode_esc = self.job1_extra_res_exec_vnode.replace(
  2512. "[", "\[").replace("]", "\]").replace(
  2513. "(", "\(").replace(")", "\)").replace("+", "\+")
  2514. newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
  2515. "1:mem=1048576kb:ncpus=2:mpiprocs=3:ompthreads=3+" + \
  2516. "1:ncpus=2:mem=2097152kb:mpiprocs=2:ompthreads=2"
  2517. newsel_esc = newsel.replace("+", "\+")
  2518. new_exec_host = self.job1_extra_res_exec_host
  2519. new_exec_host_esc = self.job1_extra_res_exec_host.replace(
  2520. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2521. new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
  2522. "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
  2523. new_exec_vnode_esc = new_exec_vnode.replace(
  2524. "[", "\[").replace("]", "\]").replace(
  2525. "(", "\(").replace(")", "\)").replace("+", "\+")
  2526. self.server.expect(JOB,
  2527. {'job_state': 'R',
  2528. 'Resource_List.mem': '5gb',
  2529. 'Resource_List.ncpus': 7,
  2530. 'Resource_List.select': newsel,
  2531. 'Resource_List.place': self.job1_extra_res_place,
  2532. 'Resource_List.nodect': 3,
  2533. 'schedselect': newsel,
  2534. 'exec_host': new_exec_host,
  2535. 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is now listed with ncpus=7, taking away released
        # vnode <n5>, that vnode comes from a super-chunk whose other vnodes
        # <n4> and <n6> are still assigned to the job. So the parent mom of
        # <n5> still won't release the job and thus, the 1 license for it is
        # still allocated.
  2541. self.license_count_match(8)
  2542. # Check account update ('u') record
  2543. self.match_accounting_log('u', jid, exec_host_esc,
  2544. exec_vnode_esc, "6gb", 8, 3,
  2545. self.job1_extra_res_place,
  2546. sel_esc)
  2547. # Check to make sure 'c' (next) record got generated
  2548. self.match_accounting_log('c', jid, new_exec_host_esc,
  2549. new_exec_vnode_esc, "5242880kb",
  2550. 7, 3, self.job1_extra_res_place, newsel_esc)
  2551. # Check various vnode status.
  2552. jobs_assn1 = "%s/0" % (jid,)
  2553. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2554. 'job-busy', jobs_assn1, 1, '1048576kb')
  2555. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  2556. '0kb')
  2557. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2558. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2559. 2, '2097152kb')
  2560. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2561. self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
  2562. 'resources_assigned.mem': '6291456kb'})
  2563. self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
  2564. 'resources_assigned.mem': '6291456kb'},
  2565. id="workq")
  2566. self.assertTrue(
  2567. self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
  2568. self.server.delete(jid)
  2569. # Check account phased end ('e') record
  2570. self.match_accounting_log('e', jid, new_exec_host_esc,
  2571. new_exec_vnode_esc,
  2572. "5242880kb", 7, 3,
  2573. self.job1_extra_res_place,
  2574. newsel_esc)
  2575. # Check to make sure 'E' (end of job) record got generated
  2576. self.match_accounting_log('E', jid, exec_host_esc,
  2577. exec_vnode_esc, "6gb",
  2578. 8, 3, self.job1_extra_res_place,
  2579. sel_esc)
    @timeout(400)
    def test_release_nodes3(self):
        """
        Test:
             Given: a job that has been submitted with a select spec
             of 2 super-chunks of ncpus=3 and mem=2gb each,
             and 1 chunk of ncpus=2 and mem=2gb, along with
             place spec of "scatter", resulting in an
             exec_vnode=
                  (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
             Executing pbs_release_nodes -j <job-id> <n6>
             results in:
             1. node <n6> no longer appearing in the job's
                exec_vnode value,
             2. resources associated with the
                node being taken out of the job's Resources_List.* and
                schedselect values,
             3. Since node <n6> is just one of the vnodes in the
                host assigned to the second super-chunk, the node
                still won't accept new jobs until all the other
                allocated vnodes from the same mom host are released.
                The resources assigned to the job from
                node <n6> continue to be assigned, including the
                corresponding licenses.
             NOTE: This is testing to make sure the position of <n6>
             in the exec_vnode string (right end of a super-chunk) will
             not break the recreation of the attribute value after
             release.
        """
  2608. """
  2609. jid = self.create_and_submit_job('job1_5')
  2610. self.server.expect(JOB, {'job_state': 'R',
  2611. 'Resource_List.mem': '6gb',
  2612. 'Resource_List.ncpus': 8,
  2613. 'Resource_List.nodect': 3,
  2614. 'Resource_List.select': self.job1_select,
  2615. 'Resource_List.place': self.job1_place,
  2616. 'schedselect': self.job1_schedselect,
  2617. 'exec_host': self.job1_exec_host,
  2618. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  2619. # server's license_count used value matches job's 'ncpus' value.
  2620. self.license_count_match(8)
  2621. # Check various vnode status.
  2622. jobs_assn1 = "%s/0" % (jid,)
  2623. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2624. 'job-busy', jobs_assn1, 1, '1048576kb')
  2625. self.match_vnode_status([self.n3, self.n6],
  2626. 'job-busy', jobs_assn1, 1, '0kb')
  2627. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2628. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2629. 2, '2097152kb')
  2630. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2631. self.assertTrue(
  2632. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  2633. # Run pbs_release_nodes as root
  2634. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n6]
  2635. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  2636. sudo=True)
  2637. self.assertEqual(ret['rc'], 0)
  2638. # Verify mom_logs
  2639. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2640. jid, self.hostB), n=10,
  2641. regexp=True,
  2642. existence=False, max_attempts=5, interval=1)
  2643. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2644. jid, self.hostC), n=10,
  2645. regexp=True,
  2646. existence=False, max_attempts=5, interval=1)
  2647. # momB and momC's hosts will not get DELETE_JOB2 request since
  2648. # not all their vnodes have been released yet from the job.
  2649. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2650. existence=False, max_attempts=5, interval=1)
  2651. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2652. existence=False, max_attempts=5, interval=1)
  2653. # Verify remaining job resources.
  2654. exec_host_esc = self.job1_exec_host.replace(
  2655. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2656. exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
  2657. "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
  2658. newsel = "1:mem=2097152kb:ncpus=3+1:mem=2097152kb:ncpus=2+" + \
  2659. "1:ncpus=2:mem=2097152kb"
  2660. newsel_esc = newsel.replace("+", "\+")
  2661. new_exec_host = self.job1_exec_host
  2662. new_exec_host_esc = self.job1_exec_host.replace(
  2663. "*", "\*").replace("[", "\[").replace("]", "\]").replace(
  2664. "+", "\+")
  2665. new_exec_vnode = self.job1_exec_vnode.replace(
  2666. "+%s:ncpus=1" % (self.n6,), "")
  2667. new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
  2668. "]", "\]").replace(
  2669. "(", "\(").replace(")", "\)").replace("+", "\+")
  2670. self.server.expect(JOB, {'job_state': 'R',
  2671. 'Resource_List.mem': '6gb',
  2672. 'Resource_List.ncpus': 7,
  2673. 'Resource_List.select': newsel,
  2674. 'Resource_List.place': self.job1_place,
  2675. 'Resource_List.nodect': 3,
  2676. 'schedselect': newsel,
  2677. 'exec_host': new_exec_host,
  2678. 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is now listed with ncpus=7, taking away released
        # vnode <n6>, that vnode comes from a super-chunk whose other vnodes
        # <n4> and <n5> are still assigned to the job. So the parent mom of
        # <n6> still won't release the job and thus, the 1 license for it is
        # still allocated.
  2684. self.license_count_match(8)
  2685. # Check account update ('u') record
  2686. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  2687. self.job1_exec_vnode_esc, "6gb", 8, 3,
  2688. self.job1_place,
  2689. self.job1_sel_esc)
  2690. # Check to make sure 'c' (next) record got generated
  2691. self.match_accounting_log('c', jid, self.job1_exec_host_esc,
  2692. new_exec_vnode_esc, "6291456kb",
  2693. 7, 3, self.job1_place, newsel_esc)
  2694. # Check various vnode status.
  2695. jobs_assn1 = "%s/0" % (jid,)
  2696. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2697. 'job-busy', jobs_assn1, 1, '1048576kb')
  2698. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  2699. '0kb')
  2700. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2701. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2702. 2, '2097152kb')
  2703. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2704. self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
  2705. 'resources_assigned.mem': '6291456kb'})
  2706. self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
  2707. 'resources_assigned.mem': '6291456kb'},
  2708. id="workq")
  2709. self.assertTrue(
  2710. self.pbs_nodefile_match_exec_host(jid, new_exec_host))
  2711. self.server.delete(jid)
  2712. # Check account phased end ('e') record
  2713. self.match_accounting_log('e', jid, new_exec_host_esc,
  2714. new_exec_vnode_esc,
  2715. "6291456kb", 7, 3,
  2716. self.job1_place,
  2717. newsel_esc)
  2718. # Check to make sure 'E' (end of job) record got generated
  2719. self.match_accounting_log('E', jid, self.job1_exec_host_esc,
  2720. self.job1_exec_vnode_esc, "6gb",
  2721. 8, 3, self.job1_place, self.job1_sel_esc)
    @timeout(400)
    def test_release_nodes3_extra(self):
        """
        Test:
             Like test_release_nodes3 except instead of the super-chunk
             and chunks getting only ncpus and mem values, additional
             resources mpiprocs and ompthreads are also requested and
             assigned. For example:

             qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
                             ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
                             ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"

             We want to make sure the ompthreads and mpiprocs values are
             preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
             the host names are duplicated according to the number of
             mpiprocs. For example, if <n1> is assigned to the first
             chunk, with mpiprocs=3, <n1> will appear 3 times in
             $PBS_NODEFILE.
        """
  2740. """
  2741. jid = self.create_and_submit_job('job1_extra_res')
  2742. self.server.expect(JOB, {'job_state': 'R',
  2743. 'Resource_List.mem': '6gb',
  2744. 'Resource_List.ncpus': 8,
  2745. 'Resource_List.nodect': 3,
  2746. 'Resource_List.select':
  2747. self.job1_extra_res_select,
  2748. 'Resource_List.place':
  2749. self.job1_extra_res_place,
  2750. 'schedselect':
  2751. self.job1_extra_res_schedselect,
  2752. 'exec_host':
  2753. self.job1_extra_res_exec_host,
  2754. 'exec_vnode':
  2755. self.job1_extra_res_exec_vnode}, id=jid)
  2756. # server's license_count used value matches job's 'ncpus' value.
  2757. self.license_count_match(8)
  2758. # Check various vnode status.
  2759. jobs_assn1 = "%s/0" % (jid,)
  2760. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2761. 'job-busy', jobs_assn1, 1, '1048576kb')
  2762. self.match_vnode_status([self.n3, self.n6],
  2763. 'job-busy', jobs_assn1, 1, '0kb')
  2764. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2765. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2766. 2, '2097152kb')
  2767. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # The pbs_nodefile_match_exec_host() call below also verifies that
        # the host names appear in $PBS_NODEFILE according to the number of
        # mpiprocs assigned to each chunk.
  2771. self.assertTrue(
  2772. self.pbs_nodefile_match_exec_host(
  2773. jid, self.job1_extra_res_exec_host,
  2774. self.job1_extra_res_schedselect))
  2775. # Run pbs_release_nodes as root
  2776. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n6]
  2777. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  2778. sudo=True)
  2779. self.assertEqual(ret['rc'], 0)
  2780. # Verify mom_logs
  2781. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2782. jid, self.hostB), n=10,
  2783. regexp=True,
  2784. existence=False, max_attempts=5, interval=1)
  2785. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2786. jid, self.hostC), n=10,
  2787. regexp=True,
  2788. existence=False, max_attempts=5, interval=1)
  2789. # momB and momC's hosts will not get DELETE_JOB2 request since
  2790. # not all their vnodes have been released yet from the job.
  2791. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2792. existence=False, max_attempts=5, interval=1)
  2793. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2794. existence=False, max_attempts=5, interval=1)
  2795. # Verify remaining job resources.
  2796. sel_esc = self.job1_extra_res_select.replace("+", "\+")
  2797. exec_host_esc = self.job1_extra_res_exec_host.replace(
  2798. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2799. exec_vnode_esc = self.job1_extra_res_exec_vnode.replace(
  2800. "[", "\[").replace(
  2801. "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
  2802. newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
  2803. "1:mem=2097152kb:ncpus=2:mpiprocs=3:ompthreads=3+" + \
  2804. "1:ncpus=2:mem=2097152kb:mpiprocs=2:ompthreads=2"
  2805. newsel_esc = newsel.replace("+", "\+")
  2806. new_exec_host = self.job1_extra_res_exec_host
  2807. new_exec_host_esc = self.job1_extra_res_exec_host.replace(
  2808. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2809. new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
  2810. "+%s:ncpus=1" % (self.n6,), "")
  2811. new_exec_vnode_esc = new_exec_vnode.replace(
  2812. "[", "\[").replace("]", "\]").replace(
  2813. "(", "\(").replace(")", "\)").replace("+", "\+")
  2814. self.server.expect(JOB,
  2815. {'job_state': 'R',
  2816. 'Resource_List.mem': '6gb',
  2817. 'Resource_List.ncpus': 7,
  2818. 'Resource_List.select': newsel,
  2819. 'Resource_List.place':
  2820. self.job1_extra_res_place,
  2821. 'Resource_List.nodect': 3,
  2822. 'schedselect': newsel,
  2823. 'exec_host': new_exec_host,
  2824. 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is now listed with ncpus=7, taking away released
        # vnode <n6>, that vnode comes from a super-chunk whose other vnodes
        # <n4> and <n5> are still assigned to the job. So the parent mom of
        # <n6> still won't release the job and thus, the 1 license for it is
        # still allocated.
  2830. self.license_count_match(8)
  2831. # Check account update ('u') record
  2832. self.match_accounting_log('u', jid, exec_host_esc,
  2833. exec_vnode_esc, "6gb", 8, 3,
  2834. self.job1_extra_res_place,
  2835. sel_esc)
  2836. # Check to make sure 'c' (next) record got generated
  2837. self.match_accounting_log('c', jid, new_exec_host_esc,
  2838. new_exec_vnode_esc, "6291456kb",
  2839. 7, 3, self.job1_extra_res_place, newsel_esc)
  2840. # Check various vnode status.
  2841. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2842. 'job-busy', jobs_assn1, 1, '1048576kb')
  2843. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  2844. '0kb')
  2845. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2846. 2, '2097152kb')
  2847. self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
  2848. 'resources_assigned.mem': '6291456kb'})
  2849. self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
  2850. 'resources_assigned.mem': '6291456kb'},
  2851. id="workq")
  2852. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2853. self.assertTrue(
  2854. self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
  2855. self.server.delete(jid)
  2856. # Check account phased end ('e') record
  2857. self.match_accounting_log('e', jid, new_exec_host_esc,
  2858. new_exec_vnode_esc,
  2859. "6291456kb", 7, 3,
  2860. self.job1_extra_res_place,
  2861. newsel_esc)
  2862. # Check to make sure 'E' (end of job) record got generated
  2863. self.match_accounting_log('E', jid, exec_host_esc,
  2864. exec_vnode_esc, "6gb",
  2865. 8, 3, self.job1_extra_res_place,
  2866. sel_esc)
    def test_release_nodes4(self):
        """
        Test:
             Given: a job that has been submitted with a select spec
             of 2 super-chunks of ncpus=3 and mem=2gb each,
             and 1 chunk of ncpus=2 and mem=2gb, along with
             place spec of "scatter", resulting in an
             exec_vnode=
                  (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
             Executing pbs_release_nodes -j <job-id> <n4> <n5> <n7>
             results in:
             1. nodes <n4>, <n5>, and <n7> no longer appearing in the
                job's exec_vnode value,
             2. resources associated with the released
                nodes being taken out of the job's Resources_List.* and
                schedselect values,
             3. Since nodes <n4> and <n5> are only some of the vnodes in
                the host assigned to the second super-chunk, that host
                still won't accept new jobs until all the other
                allocated vnodes (<n6>) from the same mom host are
                released.
             4. The resources assigned to the job from
                nodes <n4> and <n5> continue to be assigned, including
                the corresponding licenses.
             5. <n7> is the only vnode assigned to the host mapped
                to the third chunk, so it is fully deallocated and
                its assigned resources are removed from the job.
        """
  2895. jid = self.create_and_submit_job('job1_5')
  2896. self.server.expect(JOB, {'job_state': 'R',
  2897. 'Resource_List.mem': '6gb',
  2898. 'Resource_List.ncpus': 8,
  2899. 'Resource_List.nodect': 3,
  2900. 'Resource_List.select': self.job1_select,
  2901. 'Resource_List.place': self.job1_place,
  2902. 'schedselect': self.job1_schedselect,
  2903. 'exec_host': self.job1_exec_host,
  2904. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  2905. # server's license_count used value matches job's 'ncpus' value.
  2906. self.license_count_match(8)
  2907. # Check various vnode status.
  2908. jobs_assn1 = "%s/0" % (jid,)
  2909. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2910. 'job-busy', jobs_assn1, 1, '1048576kb')
  2911. self.match_vnode_status([self.n3, self.n6],
  2912. 'job-busy', jobs_assn1, 1, '0kb')
  2913. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  2914. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  2915. 2, '2097152kb')
  2916. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  2917. self.assertTrue(
  2918. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  2919. # Run pbs_release_nodes as root
  2920. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
  2921. self.n7]
  2922. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  2923. sudo=True)
  2924. self.assertEqual(ret['rc'], 0)
  2925. # momB's host will not get job summary reported but
  2926. # momC's host will get the job summary since all vnodes
  2927. # from the host have been released.
  2928. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2929. jid, self.hostB), n=10, regexp=True, existence=False,
  2930. max_attempts=5, interval=1)
  2931. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  2932. jid, self.hostC), n=10, regexp=True)
        # momB's host will not get the DELETE_JOB2 request since
        # not all of its vnodes have been released from the job yet.
        # momC's host will get the DELETE_JOB2 request since its sole
        # vnode <n7> has been released from the job.
  2937. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  2938. existence=False, max_attempts=5, interval=1)
  2939. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
        # Ensure the 'fib' process is gone from hostC once the DELETE_JOB
        # request has been received.
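        # get_proc_info() populates self.server.pu.processes with the
        # processes on the given host matching the pattern, so an empty
        # result means no leftover 'fib' processes remain on momC's host.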
  2942. self.server.pu.get_proc_info(
  2943. self.momC.hostname, ".*fib.*", None, regexp=True)
  2944. self.assertEqual(len(self.server.pu.processes), 0)
  2945. # Verify remaining job resources.
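        # The expected post-release values below are built by textually
        # removing the released entries (<n4>, <n5>, and the whole <n7>
        # chunk) from the job's original exec_host/exec_vnode strings.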
  2946. sel_esc = self.job1_select.replace("+", "\+")
  2947. exec_host_esc = self.job1_exec_host.replace(
  2948. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2949. exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
  2950. "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
  2951. newsel = "1:mem=2097152kb:ncpus=3+1:ncpus=1"
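        # The second super-chunk shrinks to "1:ncpus=1" because only <n6>
        # (1 cpu, no mem) remains assigned from it, and the <n7> chunk is
        # dropped entirely since that host was fully released.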
  2952. newsel_esc = newsel.replace("+", "\+")
  2953. new_exec_host = self.job1_exec_host.replace(
  2954. "+%s/0*2" % (self.n7,), "")
  2955. new_exec_host_esc = new_exec_host.replace(
  2956. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  2957. new_exec_vnode = self.job1_exec_vnode.replace(
  2958. "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
  2959. new_exec_vnode = new_exec_vnode.replace(
  2960. "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
  2961. new_exec_vnode = new_exec_vnode.replace(
  2962. "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
  2963. new_exec_vnode_esc = new_exec_vnode.replace(
  2964. "[", "\[").replace("]", "\]").replace(
  2965. "(", "\(").replace(")", "\)").replace("+", "\+")
  2966. self.server.expect(JOB, {'job_state': 'R',
  2967. 'Resource_List.mem': '2gb',
  2968. 'Resource_List.ncpus': 4,
  2969. 'Resource_List.select': newsel,
  2970. 'Resource_List.place': self.job1_place,
  2971. 'Resource_List.nodect': 2,
  2972. 'schedselect': newsel,
  2973. 'exec_host': new_exec_host,
  2974. 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is now listed with ncpus=4, after taking away the
        # released vnodes <n4> (1 cpu), <n5> (1 cpu), and <n7> (2 cpus),
        # only <n7> was actually freed; <n4> and <n5> are part of a
        # super-chunk that hasn't been fully released.
  2979. self.license_count_match(6)
  2980. # Check account update ('u') record
  2981. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  2982. self.job1_exec_vnode_esc, "6gb", 8, 3,
  2983. self.job1_place,
  2984. self.job1_sel_esc)
  2985. # Check to make sure 'c' (next) record got generated
  2986. self.match_accounting_log('c', jid, new_exec_host_esc,
  2987. new_exec_vnode_esc, "2097152kb",
  2988. 4, 2, self.job1_place, newsel_esc)
  2989. # Check various vnode status.
  2990. jobs_assn1 = "%s/0" % (jid,)
  2991. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  2992. 'job-busy', jobs_assn1, 1, '1048576kb')
  2993. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  2994. '0kb')
  2995. self.match_vnode_status([self.n0, self.n7, self.n8, self.n9, self.n10],
  2996. 'free')
  2997. self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
  2998. 'resources_assigned.mem': '4194304kb'})
  2999. self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
  3000. 'resources_assigned.mem': '4194304kb'},
  3001. id="workq")
  3002. self.assertTrue(
  3003. self.pbs_nodefile_match_exec_host(jid, new_exec_host))
  3004. self.server.delete(jid)
  3005. # Check account phased end ('e') record
  3006. self.match_accounting_log('e', jid, new_exec_host_esc,
  3007. new_exec_vnode_esc,
  3008. "2097152kb", 4, 2,
  3009. self.job1_place,
  3010. newsel_esc)
  3011. # Check to make sure 'E' (end of job) record got generated
  3012. self.match_accounting_log('E', jid, self.job1_exec_host_esc,
  3013. self.job1_exec_vnode_esc, "6gb",
  3014. 8, 3, self.job1_place, self.job1_sel_esc)
    def test_release_nodes4_extra(self):
        """
        Test:
             Like test_release_nodes4 except instead of the super-chunk
             and chunks getting only ncpus and mem values, additional
             resources mpiprocs and ompthreads are also requested and
             assigned. For example:

             qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
                             ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
                             ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"

             We want to make sure the ompthreads and mpiprocs values are
             preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
             the host names are duplicated according to the number of
             mpiprocs. For example, if <n1> is assigned to the first
             chunk, with mpiprocs=3, <n1> will appear 3 times in
             $PBS_NODEFILE.
        """
  3032. """
  3033. jid = self.create_and_submit_job('job1_extra_res')
  3034. self.server.expect(JOB, {'job_state': 'R',
  3035. 'Resource_List.mem': '6gb',
  3036. 'Resource_List.ncpus': 8,
  3037. 'Resource_List.nodect': 3,
  3038. 'Resource_List.select':
  3039. self.job1_extra_res_select,
  3040. 'Resource_List.place':
  3041. self.job1_extra_res_place,
  3042. 'schedselect':
  3043. self.job1_extra_res_schedselect,
  3044. 'exec_host':
  3045. self.job1_extra_res_exec_host,
  3046. 'exec_vnode':
  3047. self.job1_extra_res_exec_vnode}, id=jid)
  3048. # server's license_count used value matches job's 'ncpus' value.
  3049. self.license_count_match(8)
  3050. # Check various vnode status.
  3051. jobs_assn1 = "%s/0" % (jid,)
  3052. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3053. 'job-busy', jobs_assn1, 1, '1048576kb')
  3054. self.match_vnode_status([self.n3, self.n6],
  3055. 'job-busy', jobs_assn1, 1, '0kb')
  3056. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  3057. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  3058. 2, '2097152kb')
  3059. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # The pbs_nodefile_match_exec_host() call below also verifies that
        # the host names appear in $PBS_NODEFILE according to the number of
        # mpiprocs assigned to each chunk.
  3063. self.assertTrue(
  3064. self.pbs_nodefile_match_exec_host(
  3065. jid, self.job1_extra_res_exec_host,
  3066. self.job1_extra_res_schedselect))
  3067. # Run pbs_release_nodes as root
  3068. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
  3069. self.n7]
  3070. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  3071. sudo=True)
  3072. self.assertEqual(ret['rc'], 0)
  3073. # momB's host will not get job summary reported but
  3074. # momC's host will get the job summary since all vnodes
  3075. # from the host have been released.
  3076. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  3077. jid, self.hostB), n=10, regexp=True, existence=False,
  3078. max_attempts=5, interval=1)
  3079. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  3080. jid, self.hostC), n=10, regexp=True)
  3081. # momB's host will not get DELETE_JOB2 request since
  3082. # not all their vnodes have been released yet from the job.
  3083. # momC will get DELETE_JOB2 request since sole vnode
  3084. # <n7> has been released from the job.
  3085. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3086. existence=False, max_attempts=5, interval=1)
  3087. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
  3088. # Ensure the 'fib' process is gone from hostC when DELETE_JOB request
  3089. # received
  3090. self.server.pu.get_proc_info(
  3091. self.momC.hostname, ".*fib.*", None, regexp=True)
  3092. self.assertEqual(len(self.server.pu.processes), 0)
  3093. # Verify remaining job resources.
  3094. sel_esc = self.job1_extra_res_select.replace("+", "\+")
  3095. exec_host_esc = self.job1_extra_res_exec_host.replace(
  3096. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3097. exec_vnode_esc = self.job1_extra_res_exec_vnode.replace(
  3098. "[", "\[").replace(
  3099. "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
  3100. newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
  3101. "1:ncpus=1:mpiprocs=3:ompthreads=3"
  3102. newsel_esc = newsel.replace("+", "\+")
  3103. new_exec_host = self.job1_extra_res_exec_host.replace(
  3104. "+%s/0*2" % (self.n7,), "")
  3105. new_exec_host_esc = new_exec_host.replace(
  3106. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3107. new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
  3108. "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
  3109. new_exec_vnode = new_exec_vnode.replace(
  3110. "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
  3111. new_exec_vnode = new_exec_vnode.replace(
  3112. "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
  3113. new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
  3114. "]", "\]").replace(
  3115. "(", "\(").replace(")", "\)").replace("+", "\+")
  3116. self.server.expect(JOB, {'job_state': 'R',
  3117. 'Resource_List.mem': '2gb',
  3118. 'Resource_List.ncpus': 4,
  3119. 'Resource_List.select': newsel,
  3120. 'Resource_List.place':
  3121. self.job1_extra_res_place,
  3122. 'Resource_List.nodect': 2,
  3123. 'schedselect': newsel,
  3124. 'exec_host': new_exec_host,
  3125. 'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is now listed with ncpus=4, reflecting the released
# vnodes <n4> (1 cpu), <n5> (1 cpu), and <n7> (2 cpus), only <n7>'s
# licenses got released. <n4> and <n5> are part of a super-chunk that
# wasn't fully released.
  3130. self.license_count_match(6)
  3131. # Check account update ('u') record
  3132. self.match_accounting_log('u', jid, exec_host_esc,
  3133. exec_vnode_esc, "6gb", 8, 3,
  3134. self.job1_extra_res_place,
  3135. sel_esc)
  3136. # Check to make sure 'c' (next) record got generated
  3137. self.match_accounting_log('c', jid, new_exec_host_esc,
  3138. new_exec_vnode_esc, "2097152kb",
  3139. 4, 2, self.job1_extra_res_place, newsel_esc)
  3140. # Check various vnode status.
  3141. jobs_assn1 = "%s/0" % (jid,)
  3142. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3143. 'job-busy', jobs_assn1, 1, '1048576kb')
  3144. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  3145. '0kb')
  3146. self.match_vnode_status([self.n0, self.n7, self.n8, self.n9, self.n10],
  3147. 'free')
  3148. self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
  3149. 'resources_assigned.mem': '4194304kb'})
  3150. self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
  3151. 'resources_assigned.mem': '4194304kb'},
  3152. id="workq")
  3153. self.assertTrue(
  3154. self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
  3155. self.server.delete(jid)
  3156. # Check account phased end ('e') record
  3157. self.match_accounting_log('e', jid, new_exec_host_esc,
  3158. new_exec_vnode_esc,
  3159. "2097152kb", 4, 2,
  3160. self.job1_extra_res_place,
  3161. newsel_esc)
  3162. # Check to make sure 'E' (end of job) record got generated
  3163. self.match_accounting_log('E', jid, exec_host_esc,
  3164. exec_vnode_esc, "6gb",
  3165. 8, 3, self.job1_extra_res_place,
  3166. sel_esc)
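
    # Illustrative sketch (an assumption, not used by the tests): the
    # docstrings above state that $PBS_NODEFILE repeats each host name
    # 'mpiprocs' times.  The expected host list can be derived from
    # exec_host and schedselect as follows.
    def _expected_nodefile_hosts(self, exec_host, schedselect):
        # One exec_host entry ("<host>/<index>[*<ncpus>]") per select chunk.
        hosts = [entry.split('/')[0] for entry in exec_host.split('+')]
        expected = []
        for host, chunk in zip(hosts, schedselect.split('+')):
            mpiprocs = 1
            for res in chunk.split(':'):
                if res.startswith('mpiprocs='):
                    mpiprocs = int(res.split('=', 1)[1])
            expected.extend([host] * mpiprocs)
        return expected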
  3167. def test_release_nodes5(self):
  3168. """
  3169. Test:
  3170. Given: a job that has been submitted with a select spec
  3171. of 2 super-chunks of ncpus=3 and mem=2gb each,
  3172. and 1 chunk of ncpus=2 and mem=2gb, along with
  3173. place spec of "scatter", resulting in an
  3174. exec_vnode=
  3175. (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  3176. Executing pbs_release_nodes -j <job-id> <n5> <n6> <n7>
  3177. results in:
  3178. 1. node <n5>, <n6>, and <n7> are no longer appearing in
  3179. job's exec_vnode value,
  3180. 2. resources associated with the released
  3181. nodes are taken out of job's Resources_List.*,
  3182. schedselect values,
3. Since nodes <n5> and <n6> are only some of the vnodes on the
host assigned to the second super-chunk, those vnodes
still won't accept new jobs until all the other
allocated vnodes (<n4>) from the same mom host are
released.
  3188. 4. The resources then assigned to the job from
  3189. node <n5> and <n6> continue to be assigned including
  3190. corresponding licenses.
  3191. 5. <n7> is the only vnode assigned to the host mapped
  3192. to the third chunk so it's fully deallocated and
  3193. its assigned resources are removed from the job.
  3194. """
  3195. jid = self.create_and_submit_job('job1_5')
  3196. self.server.expect(JOB, {'job_state': 'R',
  3197. 'Resource_List.mem': '6gb',
  3198. 'Resource_List.ncpus': 8,
  3199. 'Resource_List.nodect': 3,
  3200. 'Resource_List.select': self.job1_select,
  3201. 'Resource_List.place': self.job1_place,
  3202. 'schedselect': self.job1_schedselect,
  3203. 'exec_host': self.job1_exec_host,
  3204. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  3205. # server's license_count used value matches job's 'ncpus' value.
  3206. self.license_count_match(8)
  3207. # Check various vnode status.
  3208. jobs_assn1 = "%s/0" % (jid,)
  3209. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3210. 'job-busy', jobs_assn1, 1, '1048576kb')
  3211. self.match_vnode_status([self.n3, self.n6],
  3212. 'job-busy', jobs_assn1, 1, '0kb')
  3213. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  3214. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  3215. 2, '2097152kb')
  3216. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  3217. self.assertTrue(
  3218. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  3219. # Run pbs_release_nodes as root
  3220. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5, self.n6,
  3221. self.n7]
  3222. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  3223. sudo=True)
  3224. self.assertEqual(ret['rc'], 0)
  3225. # momB's host will not get job summary reported but
  3226. # momC's host will get the job summary since all vnodes
  3227. # from the host have been released.
  3228. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  3229. jid, self.hostB), n=10, regexp=True, existence=False,
  3230. max_attempts=5, interval=1)
  3231. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  3232. jid, self.hostC), n=10, regexp=True)
  3233. # momB's host will not get DELETE_JOB2 request since
  3234. # not all their vnodes have been released yet from the job.
  3235. # momC will get DELETE_JOB2 request since sole vnode
  3236. # <n7> has been released from the job.
  3237. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3238. existence=False, max_attempts=5, interval=1)
  3239. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
  3240. # Ensure the 'fib' process is gone from hostC when DELETE_JOB request
  3241. # received
  3242. self.server.pu.get_proc_info(
  3243. self.momC.hostname, ".*fib.*", None, regexp=True)
  3244. self.assertEqual(len(self.server.pu.processes), 0)
  3245. # Verify remaining job resources.
  3246. sel_esc = self.job1_select.replace("+", "\+")
  3247. exec_host_esc = self.job1_exec_host.replace(
  3248. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3249. exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
  3250. "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
  3251. newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=1"
  3252. newsel_esc = newsel.replace("+", "\+")
  3253. new_exec_host = self.job1_exec_host.replace(
  3254. "+%s/0*2" % (self.n7,), "")
  3255. new_exec_host_esc = new_exec_host.replace(
  3256. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3257. new_exec_vnode = self.job1_exec_vnode.replace(
  3258. "+%s:mem=1048576kb:ncpus=1" % (self.n5,), "")
  3259. new_exec_vnode = new_exec_vnode.replace(
  3260. "+%s:ncpus=1" % (self.n6,), "")
  3261. new_exec_vnode = new_exec_vnode.replace(
  3262. "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
  3263. new_exec_vnode_esc = \
  3264. new_exec_vnode.replace("[", "\[").replace("]", "\]").replace(
  3265. "(", "\(").replace(")", "\)").replace("+", "\+")
  3266. self.server.expect(JOB, {'job_state': 'R',
  3267. 'Resource_List.mem': '3gb',
  3268. 'Resource_List.ncpus': 4,
  3269. 'Resource_List.select': newsel,
  3270. 'Resource_List.place': self.job1_place,
  3271. 'Resource_List.nodect': 2,
  3272. 'schedselect': newsel,
  3273. 'exec_host': new_exec_host,
  3274. 'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is now listed with ncpus=4, reflecting the released
# vnodes <n5> (1 cpu), <n6> (1 cpu), and <n7> (2 cpus), only <n7>'s
# licenses got released. <n5> and <n6> are part of a super-chunk that
# wasn't fully released.
  3279. self.license_count_match(6)
  3280. # Check account update ('u') record
  3281. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  3282. self.job1_exec_vnode_esc, "6gb", 8, 3,
  3283. self.job1_place,
  3284. self.job1_sel_esc)
  3285. # Check to make sure 'c' (next) record got generated
  3286. self.match_accounting_log('c', jid, new_exec_host_esc,
  3287. new_exec_vnode_esc, "3145728kb",
  3288. 4, 2, self.job1_place, newsel_esc)
  3289. # Check various vnode status.
  3290. jobs_assn1 = "%s/0" % (jid,)
  3291. # <n5> still job-busy
  3292. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3293. 'job-busy', jobs_assn1, 1, '1048576kb')
  3294. # <n6> still job-busy
  3295. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  3296. '0kb')
  3297. # <n7> now free
  3298. self.match_vnode_status([self.n0, self.n7, self.n8, self.n9, self.n10],
  3299. 'free')
  3300. self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
  3301. 'resources_assigned.mem': '4194304kb'})
  3302. self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
  3303. 'resources_assigned.mem': '4194304kb'},
  3304. id="workq")
  3305. self.assertTrue(
  3306. self.pbs_nodefile_match_exec_host(jid, new_exec_host))
  3307. self.server.delete(jid)
  3308. # Check account phased end ('e') record
  3309. self.match_accounting_log('e', jid, new_exec_host_esc,
  3310. new_exec_vnode_esc,
  3311. "3145728kb", 4, 2,
  3312. self.job1_place,
  3313. newsel_esc)
  3314. # Check to make sure 'E' (end of job) record got generated
  3315. self.match_accounting_log('E', jid, self.job1_exec_host_esc,
  3316. self.job1_exec_vnode_esc, "6gb",
  3317. 8, 3, self.job1_place, self.job1_sel_esc)
  3318. def test_release_nodes5_extra(self):
  3319. """
  3320. Test:
  3321. Like test_release_nodes5 except instead of the super-chunk
  3322. and chunks getting only ncpus and mem values, additional
  3323. resources mpiprocs and ompthreads are also requested and
  3324. assigned:
  3325. For example:
  3326. qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
  3327. ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
  3328. ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
  3329. We want to make sure the ompthreads and mpiprocs values are
  3330. preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
  3331. the host names are duplicated according to the number of
  3332. mpiprocs. For example, if <n1> is assigned to first
  3333. chunk, with mpiprocs=3, <n1> will appear 3 times in
  3334. $PBS_NODEFILE.
  3335. """
  3336. jid = self.create_and_submit_job('job1_extra_res')
  3337. self.server.expect(JOB, {'job_state': 'R',
  3338. 'Resource_List.mem': '6gb',
  3339. 'Resource_List.ncpus': 8,
  3340. 'Resource_List.nodect': 3,
  3341. 'Resource_List.select':
  3342. self.job1_extra_res_select,
  3343. 'Resource_List.place':
  3344. self.job1_extra_res_place,
  3345. 'schedselect':
  3346. self.job1_extra_res_schedselect,
  3347. 'exec_host':
  3348. self.job1_extra_res_exec_host,
  3349. 'exec_vnode':
  3350. self.job1_extra_res_exec_vnode}, id=jid)
  3351. # server's license_count used value matches job's 'ncpus' value.
  3352. self.license_count_match(8)
  3353. # Check various vnode status.
  3354. jobs_assn1 = "%s/0" % (jid,)
  3355. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3356. 'job-busy', jobs_assn1, 1, '1048576kb')
  3357. self.match_vnode_status([self.n3, self.n6],
  3358. 'job-busy', jobs_assn1, 1, '0kb')
  3359. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  3360. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  3361. 2, '2097152kb')
  3362. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# The pbs_nodefile_match_exec_host() function takes care of verifying
# that the host names appear according to the number of mpiprocs
# assigned to each chunk.
  3366. self.assertTrue(
  3367. self.pbs_nodefile_match_exec_host(
  3368. jid, self.job1_extra_res_exec_host,
  3369. self.job1_extra_res_schedselect))
  3370. # Run pbs_release_nodes as root
  3371. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5, self.n6,
  3372. self.n7]
  3373. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  3374. sudo=True)
  3375. self.assertEqual(ret['rc'], 0)
  3376. # momB's host will not get job summary reported but
  3377. # momC's host will get the job summary since all vnodes
  3378. # from the host have been released.
  3379. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  3380. jid, self.hostB), n=10, regexp=True, existence=False,
  3381. max_attempts=5, interval=1)
  3382. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  3383. jid, self.hostC), n=10, regexp=True)
  3384. # momB's host will not get DELETE_JOB2 request since
  3385. # not all their vnodes have been released yet from the job.
  3386. # momC will get DELETE_JOB2 request since sole vnode
  3387. # <n7> has been released from the job.
  3388. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3389. existence=False, max_attempts=5, interval=1)
  3390. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
  3391. # Ensure the 'fib' process is gone from hostC when DELETE_JOB request
  3392. # received
  3393. self.server.pu.get_proc_info(
  3394. self.momC.hostname, ".*fib.*", None, regexp=True)
  3395. self.assertEqual(len(self.server.pu.processes), 0)
  3396. # Verify remaining job resources.
  3397. sel_esc = self.job1_extra_res_select.replace("+", "\+")
  3398. exec_host_esc = self.job1_extra_res_exec_host.replace(
  3399. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3400. exec_vnode_esc = \
  3401. self.job1_extra_res_exec_vnode.replace("[", "\[").replace(
  3402. "]", "\]").replace("(", "\(").replace(")", "\)").replace(
  3403. "+", "\+")
  3404. newsel = \
  3405. "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
  3406. "1:mem=1048576kb:ncpus=1:mpiprocs=3:ompthreads=3"
  3407. newsel_esc = newsel.replace("+", "\+")
  3408. new_exec_host = self.job1_extra_res_exec_host.replace(
  3409. "+%s/0*2" % (self.n7,), "")
  3410. new_exec_host_esc = new_exec_host.replace(
  3411. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3412. new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
  3413. "+%s:mem=1048576kb:ncpus=1" % (self.n5,), "")
  3414. new_exec_vnode = new_exec_vnode.replace(
  3415. "+%s:ncpus=1" % (self.n6,), "")
  3416. new_exec_vnode = new_exec_vnode.replace(
  3417. "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
  3418. new_exec_vnode_esc = \
  3419. new_exec_vnode.replace("[", "\[").replace("]", "\]").replace(
  3420. "(", "\(").replace(")", "\)").replace("+", "\+")
  3421. self.server.expect(JOB, {'job_state': 'R',
  3422. 'Resource_List.mem': '3gb',
  3423. 'Resource_List.ncpus': 4,
  3424. 'Resource_List.select': newsel,
  3425. 'Resource_List.place':
  3426. self.job1_extra_res_place,
  3427. 'Resource_List.nodect': 2,
  3428. 'schedselect': newsel,
  3429. 'exec_host': new_exec_host,
  3430. 'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is now listed with ncpus=4, reflecting the released
# vnodes <n5> (1 cpu), <n6> (1 cpu), and <n7> (2 cpus), only <n7>'s
# licenses got released. <n5> and <n6> are part of a super-chunk that
# wasn't fully released.
  3435. self.license_count_match(6)
  3436. # Check account update ('u') record
  3437. self.match_accounting_log('u', jid, exec_host_esc,
  3438. exec_vnode_esc, "6gb", 8, 3,
  3439. self.job1_extra_res_place,
  3440. sel_esc)
  3441. # Check to make sure 'c' (next) record got generated
  3442. self.match_accounting_log('c', jid, new_exec_host_esc,
  3443. new_exec_vnode_esc, "3145728kb",
  3444. 4, 2, self.job1_extra_res_place, newsel_esc)
  3445. # Check various vnode status.
  3446. jobs_assn1 = "%s/0" % (jid,)
  3447. # <n5> still job-busy
  3448. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3449. 'job-busy', jobs_assn1, 1, '1048576kb')
  3450. # <n6> still job-busy
  3451. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  3452. '0kb')
  3453. # <n7> is now free
  3454. self.match_vnode_status([self.n0, self.n7, self.n8, self.n9, self.n10],
  3455. 'free')
  3456. self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
  3457. 'resources_assigned.mem': '4194304kb'})
  3458. self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
  3459. 'resources_assigned.mem': '4194304kb'},
  3460. id="workq")
  3461. self.assertTrue(
  3462. self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
  3463. self.server.delete(jid)
  3464. # Check account phased end ('e') record
  3465. self.match_accounting_log('e', jid, new_exec_host_esc,
  3466. new_exec_vnode_esc,
  3467. "3145728kb", 4, 2,
  3468. self.job1_extra_res_place,
  3469. newsel_esc)
  3470. # Check to make sure 'E' (end of job) record got generated
  3471. self.match_accounting_log('E', jid, exec_host_esc,
  3472. exec_vnode_esc, "6gb",
  3473. 8, 3, self.job1_extra_res_place,
  3474. sel_esc)
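
    # Illustrative sketch (an assumption, not part of the original suite):
    # the "build the command, run it, assert rc == 0" pattern around
    # pbs_release_nodes repeats in every test and could be wrapped like this,
    # with as_user=None meaning "run as root via sudo".
    def _release_vnodes(self, jid, vnodes, as_user=None):
        cmd = [self.pbs_release_nodes_cmd, '-j', jid] + list(vnodes)
        if as_user is None:
            ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                         sudo=True)
        else:
            ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                         runas=as_user)
        self.assertEqual(ret['rc'], 0)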
  3475. def test_release_nodes6(self):
  3476. """
  3477. Test:
  3478. Given: a job that has been submitted with a select spec
  3479. of 2 super-chunks of ncpus=3 and mem=2gb each,
  3480. and 1 chunk of ncpus=2 and mem=2gb, along with
  3481. place spec of "scatter", resulting in an
  3482. exec_vnode=
  3483. (<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  3484. Executing pbs_release_nodes -j <job-id> <n4> <n5> <n6> <n7>
  3485. is equivalent to doing 'pbs_release_nodes -a' which
  3486. will have the same result as test_release_nodes_all.
  3487. That is, all sister nodes assigned to the job are
  3488. released early from the job.
  3489. """
  3490. jid = self.create_and_submit_job('job1_5')
  3491. self.server.expect(JOB, {'job_state': 'R',
  3492. 'Resource_List.mem': '6gb',
  3493. 'Resource_List.ncpus': 8,
  3494. 'Resource_List.nodect': 3,
  3495. 'Resource_List.select': self.job1_select,
  3496. 'Resource_List.place': self.job1_place,
  3497. 'schedselect': self.job1_schedselect,
  3498. 'exec_host': self.job1_exec_host,
  3499. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  3500. # server's license_count used value matches job's 'ncpus' value.
  3501. self.license_count_match(8)
  3502. # Check various vnode status.
  3503. jobs_assn1 = "%s/0" % (jid,)
  3504. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3505. 'job-busy', jobs_assn1, 1, '1048576kb')
  3506. self.match_vnode_status([self.n3, self.n6],
  3507. 'job-busy', jobs_assn1, 1, '0kb')
  3508. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  3509. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  3510. 2, '2097152kb')
  3511. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  3512. self.assertTrue(
  3513. self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
  3514. # Run pbs_release_nodes as regular user
  3515. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
  3516. self.n6, self.n7]
  3517. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  3518. runas=TEST_USER)
  3519. self.assertEqual(ret['rc'], 0)
  3520. # Verify mom_logs
  3521. self.momA.log_match(
  3522. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
  3523. max_attempts=18, interval=2, regexp=True)
  3524. self.momA.log_match(
  3525. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
  3526. max_attempts=18, interval=2, regexp=True)
  3527. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3528. max_attempts=18, interval=2)
  3529. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3530. max_attempts=18, interval=2)
  3531. # Ensure the 'fib' process is gone when DELETE_JOB2 received on momB
  3532. self.server.pu.get_proc_info(
  3533. self.momB.hostname, ".*fib.*", None, regexp=True)
  3534. self.assertEqual(len(self.server.pu.processes), 0)
  3535. # Ensure the 'fib' process is gone when DELETE_JOB2 received on momC
  3536. self.server.pu.get_proc_info(
  3537. self.momC.hostname, ".*fib.*", None, regexp=True)
  3538. self.assertEqual(len(self.server.pu.processes), 0)
  3539. # Verify remaining job resources.
  3540. self.server.expect(JOB, {'job_state': 'R',
  3541. 'Resource_List.mem': '2gb',
  3542. 'Resource_List.ncpus': 3,
  3543. 'Resource_List.select': self.job1_newsel,
  3544. 'Resource_List.place': self.job1_place,
  3545. 'Resource_List.nodect': 1,
  3546. 'schedselect': self.job1_newsel,
  3547. 'exec_host': self.job1_new_exec_host,
  3548. 'exec_vnode': self.job1_new_exec_vnode},
  3549. id=jid)
  3550. # server's license_count used value matches job's 'ncpus' value.
  3551. self.license_count_match(3)
  3552. # Check various vnode status.
  3553. self.match_vnode_status([self.n1, self.n2],
  3554. 'job-busy', jobs_assn1, 1, '1048576kb')
  3555. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
  3556. # nodes <n4>, <n5>, <n6>, <n7> are all free now
  3557. self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
  3558. self.n7, self.n8, self.n9, self.n10], 'free')
  3559. self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
  3560. 'resources_assigned.mem': '2097152kb'})
  3561. self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
  3562. 'resources_assigned.mem': '2097152kb'},
  3563. id="workq")
  3564. self.assertTrue(
  3565. self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
  3566. # Check account update ('u') record
  3567. self.match_accounting_log('u', jid, self.job1_exec_host_esc,
  3568. self.job1_exec_vnode_esc, "6gb", 8, 3,
  3569. self.job1_place,
  3570. self.job1_sel_esc)
  3571. # Check to make sure 'c' (next) record got generated
  3572. self.match_accounting_log('c', jid, self.job1_new_exec_host,
  3573. self.job1_new_exec_vnode_esc, "2097152kb",
  3574. 3, 1, self.job1_place, self.job1_newsel)
  3575. # For job to end to get the end records in the accounting_logs
  3576. self.server.delete(jid)
  3577. # Check account phased end job ('e') record
  3578. self.match_accounting_log('e', jid, self.job1_new_exec_host,
  3579. self.job1_new_exec_vnode_esc, "2097152kb", 3,
  3580. 1, self.job1_place, self.job1_newsel)
  3581. # Check account end of job ('E') record
  3582. self.match_accounting_log('E', jid, self.job1_exec_host_esc,
  3583. self.job1_exec_vnode_esc, "6gb", 8, 3,
  3584. self.job1_place,
  3585. self.job1_sel_esc)
  3586. def test_release_nodes6_extra(self):
  3587. """
  3588. Test:
  3589. Like test_release_nodes6 except instead of the super-chunk
  3590. and chunks getting only ncpus and mem values, additional
  3591. resources mpiprocs and ompthreads are also requested and
  3592. assigned:
  3593. For example:
  3594. qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
  3595. ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
  3596. ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
  3597. We want to make sure the ompthreads and mpiprocs values are
  3598. preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
  3599. the host names are duplicated according to the number of
  3600. mpiprocs. For example, if <n1> is assigned to first
  3601. chunk, with mpiprocs=3, <n1> will appear 3 times in
  3602. $PBS_NODEFILE.
  3603. """
  3604. jid = self.create_and_submit_job('job1_extra_res')
  3605. self.server.expect(JOB, {'job_state': 'R',
  3606. 'Resource_List.mem': '6gb',
  3607. 'Resource_List.ncpus': 8,
  3608. 'Resource_List.nodect': 3,
  3609. 'Resource_List.select':
  3610. self.job1_extra_res_select,
  3611. 'Resource_List.place':
  3612. self.job1_extra_res_place,
  3613. 'schedselect':
  3614. self.job1_extra_res_schedselect,
  3615. 'exec_host': self.job1_extra_res_exec_host,
  3616. 'exec_vnode': self.job1_extra_res_exec_vnode},
  3617. id=jid)
  3618. # server's license_count used value matches job's 'ncpus' value.
  3619. self.license_count_match(8)
  3620. # Check various vnode status.
  3621. jobs_assn1 = "%s/0" % (jid,)
  3622. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3623. 'job-busy', jobs_assn1, 1, '1048576kb')
  3624. self.match_vnode_status([self.n3, self.n6],
  3625. 'job-busy', jobs_assn1, 1, '0kb')
  3626. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  3627. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  3628. 2, '2097152kb')
  3629. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  3630. self.assertTrue(
  3631. self.pbs_nodefile_match_exec_host(jid,
  3632. self.job1_extra_res_exec_host,
  3633. self.job1_extra_res_schedselect))
  3634. # Run pbs_release_nodes as regular user
  3635. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
  3636. self.n6, self.n7]
  3637. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  3638. runas=TEST_USER)
  3639. self.assertEqual(ret['rc'], 0)
  3640. # Verify mom_logs
  3641. self.momA.log_match(
  3642. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
  3643. max_attempts=18, interval=2, regexp=True)
  3644. self.momA.log_match(
  3645. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
  3646. max_attempts=18, interval=2, regexp=True)
  3647. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3648. max_attempts=18, interval=2)
  3649. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3650. max_attempts=18, interval=2)
  3651. # Ensure the 'fib' process is gone when DELETE_JOB2 received on momB
  3652. self.server.pu.get_proc_info(
  3653. self.momB.hostname, ".*fib.*", None)
  3654. self.assertEqual(len(self.server.pu.processes), 0)
  3655. # Ensure the 'fib' process is gone when DELETE_JOB2 received on momC
  3656. self.server.pu.get_proc_info(
  3657. self.momC.hostname, ".*fib.*", None, regexp=True)
  3658. self.assertEqual(len(self.server.pu.processes), 0)
  3659. # Verify remaining job resources.
  3660. sel_esc = self.job1_extra_res_select.replace("+", "\+")
  3661. exec_host_esc = self.job1_extra_res_exec_host.replace(
  3662. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3663. exec_vnode_esc = \
  3664. self.job1_extra_res_exec_vnode.replace("[", "\[").replace(
  3665. "]", "\]").replace(
  3666. "(", "\(").replace(")", "\)").replace("+", "\+")
  3667. newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2"
  3668. newsel_esc = newsel.replace("+", "\+")
  3669. new_exec_host = self.job1_extra_res_exec_host.replace(
  3670. "+%s/0*2" % (self.n7,), "")
  3671. new_exec_host = new_exec_host.replace("+%s/0*0" % (self.n4,), "")
  3672. new_exec_host_esc = new_exec_host.replace(
  3673. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3674. new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
  3675. "+%s:mem=1048576kb:ncpus=1" % (self.n5,), "")
  3676. new_exec_vnode = new_exec_vnode.replace(
  3677. "+%s:ncpus=1" % (self.n6,), "")
  3678. new_exec_vnode = new_exec_vnode.replace(
  3679. "+(%s:mem=1048576kb:ncpus=1)" % (self.n4,), "")
  3680. new_exec_vnode = new_exec_vnode.replace(
  3681. "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
  3682. new_exec_vnode_esc = \
  3683. new_exec_vnode.replace("[", "\[").replace("]", "\]").replace(
  3684. "(", "\(").replace(")", "\)").replace("+", "\+")
  3685. self.server.expect(JOB,
  3686. {'job_state': 'R',
  3687. 'Resource_List.mem': '2gb',
  3688. 'Resource_List.ncpus': 3,
  3689. 'Resource_List.select': newsel,
  3690. 'Resource_List.place':
  3691. self.job1_extra_res_place,
  3692. 'Resource_List.nodect': 1,
  3693. 'schedselect': newsel,
  3694. 'exec_host': new_exec_host,
  3695. 'exec_vnode': new_exec_vnode}, id=jid)
  3696. # server's license_count used value matches job's 'ncpus' value.
  3697. self.license_count_match(3)
  3698. # Check various vnode status.
  3699. self.match_vnode_status([self.n1, self.n2],
  3700. 'job-busy', jobs_assn1, 1, '1048576kb')
  3701. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
  3702. self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
  3703. 'resources_assigned.mem': '2097152kb'})
  3704. self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
  3705. 'resources_assigned.mem': '2097152kb'},
  3706. id="workq")
  3707. # nodes <n4>, <n5>, <n6>, <n7> are all free now
  3708. self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
  3709. self.n7, self.n8, self.n9, self.n10], 'free')
  3710. # Ensure the $PBS_NODEFILE contents account for the mpiprocs value;
  3711. # that is, each node hostname is listed 'mpiprocs' number of times in
  3712. # the file.
  3713. self.assertTrue(
  3714. self.pbs_nodefile_match_exec_host(
  3715. jid, self.job1_new_exec_host, newsel))
  3716. # Check account update ('u') record
  3717. self.match_accounting_log('u', jid, exec_host_esc,
  3718. exec_vnode_esc,
  3719. "6gb", 8, 3,
  3720. self.job1_extra_res_place,
  3721. sel_esc)
  3722. # Check to make sure 'c' (next) record got generated
  3723. self.match_accounting_log('c', jid, new_exec_host_esc,
  3724. self.job1_new_exec_vnode_esc, "2097152kb",
  3725. 3, 1, self.job1_place, newsel_esc)
  3726. # For job to end to get the end records in the accounting_logs
  3727. self.server.delete(jid)
  3728. # Check account phased end job ('e') record
  3729. self.match_accounting_log('e', jid, new_exec_host_esc,
  3730. new_exec_vnode_esc, "2097152kb", 3,
  3731. 1, self.job1_place, newsel_esc)
  3732. # Check account end of job ('E') record
  3733. self.match_accounting_log('E', jid, exec_host_esc,
  3734. exec_vnode_esc, "6gb", 8, 3,
  3735. self.job1_place, sel_esc)
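
    # Illustrative sketch (an assumption): the repeated "make sure the job's
    # 'fib' processes are gone from a released host" check could be factored
    # out as below.
    def _assert_no_fib(self, hostname):
        self.server.pu.get_proc_info(hostname, ".*fib.*", None, regexp=True)
        self.assertEqual(len(self.server.pu.processes), 0)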
# A longer timeout is needed as the following test takes a bit
# longer waiting for the job to finish due to stage out.
  3738. @timeout(400)
  3739. def test_release_nodes_cmd_plus_stageout(self):
  3740. """
  3741. Test:
This tests calling the pbs_release_nodes command on a job
submitted with the release_nodes_on_stageout option.
  3744. Given a job submitted as:
  3745. qsub -W release_nodes_on_stageout=true job.script
  3746. where job.script specifies a select spec of
  3747. 2 super-chunks of ncpus=3 and mem=2gb each,
  3748. and 1 chunk of ncpus=2 and mem=2gb, along with
  3749. place spec of "scatter", resulting in an:
  3750. exec_vnode=(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  3751. Then issue:
  3752. pbs_release_nodes -j <job-id> <n7>
This would generate a 'u' and a 'c' accounting record,
while the <n7> vnode gets deallocated given that it's
the only vnode assigned to the host mapped to the third chunk.
  3756. Now call:
  3757. qdel <job-id>
This would cause the remaining vnodes <n4>, <n5>, <n6>
to be deallocated due to the job having the
-W release_nodes_on_stageout=true setting.
The result is reflected in the 'u', 'c', and 'e'
accounting records, and the 'E' accounting record
summarizes everything.
  3764. """
  3765. jid = self.create_and_submit_job('job1')
  3766. self.server.expect(JOB, {'job_state': 'R',
  3767. 'release_nodes_on_stageout': 'True',
  3768. 'Resource_List.mem': '6gb',
  3769. 'Resource_List.ncpus': 8,
  3770. 'Resource_List.nodect': 3,
  3771. 'Resource_List.select': self.job1_select,
  3772. 'Resource_List.place': self.job1_place,
  3773. 'schedselect': self.job1_schedselect,
  3774. 'exec_host': self.job1_exec_host,
  3775. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  3776. # server's license_count used value matches job's 'ncpus' value.
  3777. self.license_count_match(8)
  3778. # Check various vnode status.
  3779. jobs_assn1 = "%s/0" % (jid,)
  3780. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3781. 'job-busy', jobs_assn1, 1, '1048576kb')
  3782. self.match_vnode_status([self.n3, self.n6],
  3783. 'job-busy', jobs_assn1, 1, '0kb')
  3784. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  3785. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  3786. 2, '2097152kb')
  3787. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  3788. # Run pbs_release_nodes
  3789. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n7]
  3790. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  3791. sudo=True)
  3792. self.assertEqual(ret['rc'], 0)
# Only mom hostC will get the job summary since it was released
# early, courtesy of its sole assigned vnode <n7> being released.
  3795. self.momA.log_match(
  3796. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
  3797. regexp=True, existence=False, max_attempts=5, interval=1)
  3798. self.momA.log_match(
  3799. "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
  3800. regexp=True)
# Only mom hostC will get the IM_DELETE_JOB2 request
  3802. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3803. existence=False, max_attempts=5, interval=1)
  3804. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
  3805. # Ensure the 'fib' process is gone from hostC when DELETE_JOB request
  3806. # received
  3807. self.server.pu.get_proc_info(
  3808. self.momC.hostname, ".*fib.*", None, regexp=True)
  3809. self.assertEqual(len(self.server.pu.processes), 0)
  3810. # Verify remaining job resources.
  3811. sel_esc = self.job1_select.replace("+", "\+")
  3812. exec_host_esc = self.job1_exec_host.replace(
  3813. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3814. exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
  3815. "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
  3816. newsel = "1:mem=2097152kb:ncpus=3+1:mem=2097152kb:ncpus=3"
  3817. newsel_esc = newsel.replace("+", "\+")
  3818. new_exec_host = "%s/0*0+%s/0*0" % (self.n0, self.hostB)
  3819. new_exec_host_esc = new_exec_host.replace(
  3820. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3821. new_exec_vnode = self.job1_exec_vnode.replace(
  3822. "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
  3823. new_exec_vnode_esc = new_exec_vnode.replace(
  3824. "[", "\[").replace("]", "\]").replace(
  3825. "(", "\(").replace(")", "\)").replace("+", "\+")
  3826. self.server.expect(JOB, {'job_state': 'R',
  3827. 'Resource_List.mem': '4194304kb',
  3828. 'Resource_List.ncpus': 6,
  3829. 'Resource_List.select': newsel,
  3830. 'Resource_List.place': self.job1_place,
  3831. 'Resource_List.nodect': 2,
  3832. 'schedselect': newsel,
  3833. 'exec_host': new_exec_host,
  3834. 'exec_vnode': new_exec_vnode}, id=jid)
  3835. # server's license_count used value matches job's 'ncpus' value.
  3836. self.license_count_match(6)
  3837. # Check various vnode status.
  3838. jobs_assn1 = "%s/0" % (jid,)
  3839. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3840. 'job-busy', jobs_assn1, 1, '1048576kb')
  3841. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  3842. '0kb')
  3843. # <n7> now free
  3844. self.match_vnode_status([self.n0, self.n7, self.n8, self.n9, self.n10],
  3845. 'free')
  3846. self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
  3847. 'resources_assigned.mem': '4194304kb'})
  3848. self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
  3849. 'resources_assigned.mem': '4194304kb'},
  3850. id="workq")
  3851. self.assertTrue(
  3852. self.pbs_nodefile_match_exec_host(jid, new_exec_host))
  3853. # Check account update ('u') record
  3854. self.match_accounting_log('u', jid, exec_host_esc,
  3855. exec_vnode_esc, "6gb", 8, 3,
  3856. self.job1_place,
  3857. sel_esc)
  3858. # Check to make sure 'c' (next) record got generated
  3859. self.match_accounting_log('c', jid, new_exec_host_esc,
  3860. new_exec_vnode_esc, "4194304kb",
  3861. 6, 2, self.job1_place, newsel_esc)
  3862. # Terminate the job
  3863. self.server.delete(jid)
  3864. # Check 'u' accounting record from release_nodes_on_stageout=true
  3865. self.match_accounting_log('u', jid, new_exec_host_esc,
  3866. new_exec_vnode_esc, "4194304kb", 6, 2,
  3867. self.job1_place,
  3868. newsel_esc)
  3869. # Verify mom_logs
  3870. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  3871. jid, self.hostB), n=10,
  3872. max_attempts=18, interval=2, regexp=True)
  3873. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3874. max_attempts=18, interval=2)
  3875. # Verify remaining job resources.
  3876. sel_esc = self.job1_select.replace("+", "\+")
  3877. exec_host_esc = self.job1_exec_host.replace(
  3878. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3879. exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
  3880. "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
  3881. newsel = self.transform_select(self.job1_select.split('+')[0])
  3882. newsel_esc = newsel.replace("+", "\+")
  3883. new_exec_host = self.job1_exec_host.split('+')[0]
  3884. new_exec_host_esc = new_exec_host.replace(
  3885. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3886. new_exec_vnode = self.job1_exec_vnode.split(')')[0] + ')'
  3887. new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
  3888. "]", "\]").replace("(", "\(").replace(")", "\)").replace(
  3889. "+", "\+")
  3890. self.server.expect(JOB, {'job_state': 'E',
  3891. 'Resource_List.mem': '2gb',
  3892. 'Resource_List.ncpus': 3,
  3893. 'Resource_List.select': newsel,
  3894. 'Resource_List.place': self.job1_place,
  3895. 'Resource_List.nodect': 1,
  3896. 'schedselect': newsel,
  3897. 'exec_host': new_exec_host,
  3898. 'exec_vnode': new_exec_vnode}, id=jid)
  3899. # server's license_count used value matches job's 'ncpus' value.
  3900. self.license_count_match(3)
  3901. # Check various vnode status.
# only vnodes from mother superior (self.hostA) are job-busy
  3903. jobs_assn1 = "%s/0" % (jid,)
  3904. self.match_vnode_status([self.n1, self.n2],
  3905. 'job-busy', jobs_assn1, 1, '1048576kb')
  3906. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
  3907. self.match_vnode_status([self.n0, self.n4, self.n5,
  3908. self.n6, self.n7, self.n8, self.n9,
  3909. self.n10], 'free')
  3910. self.assertTrue(
  3911. self.pbs_nodefile_match_exec_host(jid, new_exec_host))
  3912. # Check 'c' accounting record from release_nodes_on_stageout=true
  3913. self.match_accounting_log('c', jid, new_exec_host_esc,
  3914. new_exec_vnode_esc, "2097152kb",
  3915. 3, 1, self.job1_place, newsel_esc)
  3916. # wait for job to finish
  3917. self.server.expect(JOB, 'queue', id=jid, op=UNSET, max_attempts=100,
  3918. interval=4, offset=15)
  3919. # Check 'e' record to release_nodes_on_stageout=true
  3920. self.match_accounting_log('e', jid, new_exec_host,
  3921. new_exec_vnode_esc, "2097152kb",
  3922. 3, 1, self.job1_place, newsel_esc)
  3923. # Check 'E' (end of job) record to release_nodes_on_stageout=true
  3924. self.match_accounting_log('E', jid, exec_host_esc,
  3925. exec_vnode_esc, "6gb", 8, 3,
  3926. self.job1_place,
  3927. self.job1_sel_esc)
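
    # Illustrative sketch (an assumption, not used by the tests): the new
    # exec_vnode values above are built by hand with .replace() calls that
    # drop each released vnode.  The same result can be computed by parsing
    # the chunk structure "(v1:res+v2:res)+(v3:res)" directly.
    def _drop_vnodes(self, exec_vnode, released):
        new_chunks = []
        for chunk in exec_vnode.split(')+('):
            chunk = chunk.strip('()')
            # Keep only the vnode entries whose name is not being released.
            kept = [v for v in chunk.split('+')
                    if v.split(':')[0] not in released]
            if kept:
                new_chunks.append('(' + '+'.join(kept) + ')')
        return '+'.join(new_chunks)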
  3928. def test_multi_release_nodes(self):
  3929. """
  3930. Test:
This tests several calls to the pbs_release_nodes command for
the same job.
  3933. Given a job submitted with a select spec of
  3934. 2 super-chunks of ncpus=3 and mem=2gb each,
  3935. and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  3939. First call:
  3940. pbs_release_nodes -j <job-id> <n4>
  3941. <n4> node no longer shows in job's exec_vnode,
  3942. but it will still show as job-busy
  3943. (not accept jobs) since the other 2 vnodes,
  3944. <n5> and <n6> from the host mapped to second
  3945. chunk are still assigned. The 'u' and 'c'
  3946. accounting records will reflect this.
  3947. Second call:
  3948. pbs_release_nodes -j <job-id> <n5> <n6> <n7>
Now all the vnodes assigned to the job from the
host mapped to the second chunk will show as free.
  3951. Again, the accounting 'u' and 'c' records would
  3952. reflect this fact.
  3953. """
  3954. jid = self.create_and_submit_job('job1')
  3955. self.server.expect(JOB, {'job_state': 'R',
  3956. 'Resource_List.mem': '6gb',
  3957. 'Resource_List.ncpus': 8,
  3958. 'Resource_List.nodect': 3,
  3959. 'Resource_List.select': self.job1_select,
  3960. 'Resource_List.place': self.job1_place,
  3961. 'schedselect': self.job1_schedselect,
  3962. 'exec_host': self.job1_exec_host,
  3963. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  3964. # server's license_count used value matches job's 'ncpus' value.
  3965. self.license_count_match(8)
  3966. # Check various vnode status.
  3967. jobs_assn1 = "%s/0" % (jid,)
  3968. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  3969. 'job-busy', jobs_assn1, 1, '1048576kb')
  3970. self.match_vnode_status([self.n3, self.n6],
  3971. 'job-busy', jobs_assn1, 1, '0kb')
  3972. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  3973. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  3974. 2, '2097152kb')
  3975. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  3976. # Run pbs_release_nodes
  3977. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
  3978. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  3979. sudo=True)
  3980. self.assertEqual(ret['rc'], 0)
  3981. # Verify mom_logs
  3982. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  3983. jid, self.hostB), n=10,
  3984. regexp=True,
  3985. existence=False, max_attempts=5, interval=1)
  3986. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  3987. jid, self.hostC), n=10,
  3988. regexp=True,
  3989. existence=False, max_attempts=5, interval=1)
  3990. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3991. existence=False, max_attempts=5, interval=1)
  3992. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  3993. existence=False, max_attempts=5, interval=1)
  3994. # Verify remaining job resources.
  3995. sel_esc = self.job1_select.replace("+", "\+")
  3996. exec_host_esc = self.job1_exec_host.replace(
  3997. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  3998. exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
  3999. "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
  4000. newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
  4001. "1:ncpus=2:mem=2097152kb"
  4002. newsel_esc = newsel.replace("+", "\+")
  4003. new_exec_host = self.job1_exec_host
  4004. new_exec_host_esc = self.job1_exec_host.replace(
  4005. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  4006. new_exec_vnode = self.job1_exec_vnode.replace(
  4007. "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
  4008. new_exec_vnode_esc = \
  4009. new_exec_vnode.replace("[", "\[").replace(
  4010. "]", "\]").replace("(", "\(").replace(
  4011. ")", "\)").replace("+", "\+")
  4012. self.server.expect(JOB, {'job_state': 'R',
  4013. 'Resource_List.mem': '5gb',
  4014. 'Resource_List.ncpus': 7,
  4015. 'Resource_List.select': newsel,
  4016. 'Resource_List.place': self.job1_place,
  4017. 'Resource_List.nodect': 3,
  4018. 'schedselect': newsel,
  4019. 'exec_host': new_exec_host,
  4020. 'exec_vnode': new_exec_vnode}, id=jid)
  4021. # Though the job is listed with ncpus=7 taking away released vnode
  4022. # <n4> (1 cpu), its license is not taken away as <n4> is assigned
  4023. # to a super chunk, and the parent mom still has not released the
  4024. # job as vnodes <n5> and <n6> are still allocated to the job.
  4025. self.license_count_match(8)
  4026. # Check various vnode status.
  4027. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  4028. 'job-busy', jobs_assn1, 1, '1048576kb')
  4029. self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
  4030. '0kb')
  4031. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10],
  4032. 'free')
  4033. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  4034. 2, '2097152kb')
  4035. self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
  4036. 'resources_assigned.mem': '6291456kb'})
  4037. self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
  4038. 'resources_assigned.mem': '6291456kb'},
  4039. id="workq")
  4040. # Check account update ('u') record
  4041. self.match_accounting_log('u', jid, exec_host_esc,
  4042. exec_vnode_esc, "6gb", 8, 3,
  4043. self.job1_place,
  4044. sel_esc)
  4045. # Check to make sure 'c' (next) record got generated
  4046. self.match_accounting_log('c', jid, new_exec_host_esc,
  4047. new_exec_vnode_esc, "5242880kb",
  4048. 7, 3, self.job1_place, newsel_esc)
  4049. # Run pbs_release_nodes again
  4050. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5,
  4051. self.n6, self.n7]
  4052. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  4053. sudo=True)
  4054. self.assertEqual(ret['rc'], 0)
# Now mom hostB and hostC can fully release the job,
# resulting in job summary information being reported.
  4057. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  4058. jid, self.hostB), n=10,
  4059. max_attempts=8, interval=2, regexp=True)
  4060. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  4061. jid, self.hostC), n=10,
  4062. max_attempts=8, interval=2, regexp=True)
  4063. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  4064. max_attempts=8, interval=2)
  4065. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  4066. max_attempts=8, interval=2)
  4067. # Check account update ('u') record got generated
  4068. # second pbs_release_nodes call
  4069. self.match_accounting_log('u', jid, new_exec_host_esc,
  4070. new_exec_vnode_esc, "5242880kb", 7, 3,
  4071. self.job1_place,
  4072. newsel_esc)
  4073. # Verify remaining job resources.
  4074. newsel = "1:mem=2097152kb:ncpus=3"
  4075. newsel_esc = newsel.replace("+", "\+")
  4076. new_exec_host = new_exec_host.replace("+%s/0*2" % (self.n7,), "")
  4077. new_exec_host = new_exec_host.replace("+%s/0*0" % (self.n4,), "")
  4078. new_exec_host_esc = new_exec_host.replace(
  4079. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  4080. new_exec_vnode = new_exec_vnode.replace(
  4081. "+(%s:mem=1048576kb:ncpus=1" % (self.n5,), "")
  4082. new_exec_vnode = new_exec_vnode.replace(
  4083. "+%s:ncpus=1)" % (self.n6,), "")
  4084. new_exec_vnode = new_exec_vnode.replace(
  4085. "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
  4086. new_exec_vnode_esc = new_exec_vnode.replace(
  4087. "[", "\[").replace("]", "\]").replace(
  4088. "(", "\(").replace(")", "\)").replace("+", "\+")
  4089. self.server.expect(JOB, {'job_state': 'R',
  4090. 'Resource_List.mem': '2gb',
  4091. 'Resource_List.ncpus': 3,
  4092. 'Resource_List.select': newsel,
  4093. 'Resource_List.place': self.job1_place,
  4094. 'Resource_List.nodect': 1,
  4095. 'schedselect': newsel,
  4096. 'exec_host': new_exec_host,
  4097. 'exec_vnode': new_exec_vnode}, id=jid)
  4098. # server's license_count used value matches job's 'ncpus' value.
  4099. self.license_count_match(3)
  4100. # Check various vnode status.
  4101. jobs_assn1 = "%s/0" % (jid,)
  4102. self.match_vnode_status([self.n1, self.n2],
  4103. 'job-busy', jobs_assn1, 1, '1048576kb')
  4104. self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
  4105. self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
  4106. self.n7, self.n8, self.n9, self.n10],
  4107. 'free')
  4108. self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
  4109. 'resources_assigned.mem': '2097152kb'})
  4110. self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
  4111. 'resources_assigned.mem': '2097152kb'},
  4112. id="workq")
  4113. # Check to make sure 'c' (next) record got generated for
  4114. # second pbs_release_nodes call
  4115. self.match_accounting_log('c', jid, new_exec_host_esc,
  4116. new_exec_vnode_esc, "2097152kb",
  4117. 3, 1, self.job1_place, newsel_esc)
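
    # Illustrative summary (an assumption drawn from the checks above): each
    # successful pbs_release_nodes call produces a 'u' (update) accounting
    # record for the allocation before the release and a 'c' (continuation)
    # record for the reduced allocation; job end then adds an 'e' (phased
    # end) record for the final allocation and an 'E' record summarizing the
    # originally requested resources.  A sketch of the per-call check:
    def _check_release_records(self, jid, place, old, new):
        # 'old' and 'new' are hypothetical tuples of already-escaped values:
        # (exec_host, exec_vnode, mem, ncpus, nodect, select).
        (eh, ev, mem, ncpus, nodect, sel) = old
        self.match_accounting_log('u', jid, eh, ev, mem, ncpus, nodect,
                                  place, sel)
        (eh, ev, mem, ncpus, nodect, sel) = new
        self.match_accounting_log('c', jid, eh, ev, mem, ncpus, nodect,
                                  place, sel)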
  4118. def test_release_nodes_run_next_job(self):
  4119. """
  4120. Test:
  4121. Test releasing nodes of one job to allow another
  4122. job to use resources from the released nodes.
  4123. Given a job submitted with a select spec of
  4124. 2 super-chunks of ncpus=3 and mem=2gb each,
  4125. and 1 chunk of ncpus=2 and mem=2gb, along with
  4126. place spec of "scatter", resulting in an:
  4127. exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  4129. First call:
  4130. pbs_release_nodes -j <job-id> <n4> <n5> <n7>
Submit another job j2 that also needs the
unreleased vnode <n6>, so the job stays queued.
  4133. Now execute:
pbs_release_nodes -j <job-id> <n6>
  4135. And job j2 starts executing using node <n6>
  4136. """
  4137. jid = self.create_and_submit_job('job1_5')
  4138. self.server.expect(JOB, {'job_state': 'R',
  4139. 'Resource_List.mem': '6gb',
  4140. 'Resource_List.ncpus': 8,
  4141. 'Resource_List.nodect': 3,
  4142. 'Resource_List.select': self.job1_select,
  4143. 'Resource_List.place': self.job1_place,
  4144. 'schedselect': self.job1_schedselect,
  4145. 'exec_host': self.job1_exec_host,
  4146. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  4147. # server's license_count used value matches job's 'ncpus' value.
  4148. self.license_count_match(8)
  4149. # Check various vnode status.
  4150. jobs_assn1 = "%s/0" % (jid,)
  4151. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  4152. 'job-busy', jobs_assn1, 1, '1048576kb')
  4153. self.match_vnode_status([self.n3, self.n6],
  4154. 'job-busy', jobs_assn1, 1, '0kb')
  4155. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  4156. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  4157. 2, '2097152kb')
  4158. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  4159. # Run pbs_release_nodes
  4160. cmd = [self.pbs_release_nodes_cmd, '-j', jid,
  4161. self.n4, self.n5, self.n7]
  4162. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  4163. sudo=True)
  4164. self.assertEqual(ret['rc'], 0)
  4165. self.match_vnode_status([self.n3, self.n6],
  4166. 'job-busy', jobs_assn1, 1, '0kb')
  4167. # this is a 7-cpu job that needs <n6> which has not been freed
  4168. jid2 = self.create_and_submit_job('job2')
  4169. # we expect job_state to be Queued
  4170. self.server.expect(JOB, 'comment', op=SET, id=jid2)
  4171. self.server.expect(JOB, {'job_state': 'Q'}, id=jid2)
  4172. # Let's release the remaining <node6> vnode from hostB
  4173. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n6]
  4174. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  4175. sudo=True)
  4176. self.assertEqual(ret['rc'], 0)
  4177. # now job 2 should start running
  4178. self.server.expect(JOB, {'job_state': 'R',
  4179. 'Resource_List.mem': '7gb',
  4180. 'Resource_List.ncpus': 7,
  4181. 'Resource_List.nodect': 3,
  4182. 'Resource_List.select': self.job2_select,
  4183. 'Resource_List.place': self.job2_place,
  4184. 'schedselect': self.job2_schedselect,
  4185. 'exec_host': self.job2_exec_host,
  4186. 'exec_vnode': self.job2_exec_vnode_var1},
  4187. id=jid2)
  4188. jobs_assn2 = "%s/0" % (jid2,)
  4189. self.match_vnode_status([self.n4, self.n5, self.n6, self.n8, self.n9],
  4190. 'job-busy', jobs_assn2, 1, '1048576kb')
  4191. jobs_assn3 = "%s/0, %s/1" % (jid2, jid2)
  4192. self.match_vnode_status([self.n7], 'job-busy', jobs_assn3,
  4193. 2, '2097152kb')
  4194. self.match_vnode_status([self.n0, self.n10], 'free')
  4195. def test_release_nodes_rerun(self):
  4196. """
  4197. Test:
Test the behavior of a job with released nodes when it
gets rerun. The job is killed, requeued, and assigned
the original set of resources it had before pbs_release_nodes
was called.
  4202. Given a job submitted with a select spec of
  4203. 2 super-chunks of ncpus=3 and mem=2gb each,
  4204. and 1 chunk of ncpus=2 and mem=2gb, along with
  4205. place spec of "scatter", resulting in an:
  4206. exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
  4208. First call:
  4209. pbs_release_nodes -j <job-id> <n5> <n6> <n7>
  4210. Then call:
  4211. qrerun <job-id>
  4212. Causes the job to rerun with the original requested
  4213. resources.
  4214. """
  4215. jid = self.create_and_submit_job('job1_5')
  4216. self.server.expect(JOB, {'job_state': 'R',
  4217. 'Resource_List.mem': '6gb',
  4218. 'Resource_List.ncpus': 8,
  4219. 'Resource_List.nodect': 3,
  4220. 'Resource_List.select': self.job1_select,
  4221. 'Resource_List.place': self.job1_place,
  4222. 'schedselect': self.job1_schedselect,
  4223. 'exec_host': self.job1_exec_host,
  4224. 'exec_vnode': self.job1_exec_vnode}, id=jid)
  4225. # server's license_count used value matches job's 'ncpus' value.
  4226. self.license_count_match(8)
  4227. # Check various vnode status.
  4228. jobs_assn1 = "%s/0" % (jid,)
  4229. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  4230. 'job-busy', jobs_assn1, 1, '1048576kb')
  4231. self.match_vnode_status([self.n3, self.n6],
  4232. 'job-busy', jobs_assn1, 1, '0kb')
  4233. jobs_assn2 = "%s/0, %s/1" % (jid, jid)
  4234. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  4235. 2, '2097152kb')
  4236. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  4237. # Run pbs_release_nodes
  4238. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5,
  4239. self.n6, self.n7]
  4240. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  4241. sudo=True)
  4242. self.assertEqual(ret['rc'], 0)
  4243. # only mom hostC released the job since the sole vnode
  4244. # <n7> has been released
  4245. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  4246. jid, self.hostB), n=10, regexp=True,
  4247. existence=False, max_attempts=5, interval=1)
  4248. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  4249. jid, self.hostC), n=10, regexp=True)
  4250. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
  4251. existence=False, max_attempts=5, interval=1)
  4252. self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
  4253. # Verify remaining job resources.
        sel_esc = self.job1_select.replace("+", "\+")
        exec_host_esc = self.job1_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
        newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=1"
        newsel_esc = newsel.replace("+", "\+")
        new_exec_host = self.job1_exec_host.replace(
            "+%s/0*2" % (self.n7,), "")
        new_exec_host_esc = new_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        new_exec_vnode = self.job1_exec_vnode.replace(
            "+%s:mem=1048576kb:ncpus=1" % (self.n5,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "+%s:ncpus=1" % (self.n6,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
        new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace(
            "(", "\(").replace(")", "\)").replace("+", "\+")
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '3gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job1_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is listed with ncpus=4, taking away released vnodes
        # <n5> (1 cpu), <n6> (1 cpu), <n7> (2 cpus),
        # only the 2 cpu licenses from the chunk containing <n7> are released:
        # <n5> and <n6> are from a super-chunk, and their parent mom is not
        # releasing the job yet since vnode <n4> is still assigned to the job.
        self.license_count_match(6)
        # Check account update ('u') record
        self.match_accounting_log('u', jid, exec_host_esc,
                                  exec_vnode_esc, "6gb", 8, 3,
                                  self.job1_place,
                                  sel_esc)
        # Check to make sure 'c' (next) record got generated
        self.match_accounting_log('c', jid, new_exec_host_esc,
                                  new_exec_vnode_esc, "3145728kb",
                                  4, 2, self.job1_place, newsel_esc)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n5], 'job-busy', jobs_assn1,
                                1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n7, self.n8, self.n9,
                                 self.n10], 'free')
        # Now rerun the job
        self.server.rerunjob(jid)
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode}, id=jid)
        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(8)
        # Check various vnode status.
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')

    def test_release_nodes_epilogue(self):
        """
        Test:
            Test to make sure that a job is removed from a mom host
            once all of the job's vnodes on that host have been
            released, and that the epilogue hook runs there.
        """
        # First, submit an epilogue hook:
        hook_body = """
import pbs
pbs.logjobmsg(pbs.event().job.id, "epilogue hook executed")
"""
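        # (Note: an execjob_epilogue hook runs on a mom when the job is being
        # cleaned up on that host, so once pbs_release_nodes frees all of a
        # sister mom's vnodes we expect both the DELETE_JOB2 message and the
        # hook's "epilogue hook executed" line in that mom's log, which is
        # what the log_match() calls below assert.)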
        a = {'event': 'execjob_epilogue', 'enabled': 'true'}
        self.server.create_import_hook("epi", a, hook_body)
        jid = self.create_and_submit_job('job1_5')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode}, id=jid)
        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(8)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # Run pbs_release_nodes
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
               self.n6, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertEqual(ret['rc'], 0)
        # Verify mom_logs
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
        self.momB.log_match("Job;%s;epilogue hook executed" % (jid,), n=20)
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostB), n=10,
            max_attempts=2, interval=5, regexp=True)
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostC), n=10,
            max_attempts=2, interval=5, regexp=True)
        # Ensure the 'fib' process is gone once DELETE_JOB2 is processed
        self.server.pu.get_proc_info(
            self.momB.hostname, ".*fib.*", None, regexp=True)
        self.assertEqual(len(self.server.pu.processes), 0)
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            max_attempts=2, interval=5)
        self.momC.log_match("Job;%s;epilogue hook executed" % (jid,), n=20,
                            max_attempts=2, interval=5)
        # Ensure the 'fib' process is gone once DELETE_JOB2 is processed
        self.server.pu.get_proc_info(
            self.momC.hostname, ".*fib.*", None, regexp=True)
        self.assertEqual(len(self.server.pu.processes), 0)

    def test_release_nodes_complex(self):
        """
        Test:
            Test a complicated scenario involving releasing nodes
            from a job that has been submitted with exclusive
            placement (-l place=scatter:excl), having one of the
            parent moms of released vnodes stopped and continued,
            suspending and resuming jobs, and finally submitting a
            new job requiring non-exclusive access to a vnode.
            Given a job submitted with a select spec of
            2 super-chunks of ncpus=3 and mem=2gb each,
            and 1 chunk of ncpus=1 and mem=1gb, along with
            a place spec of "scatter:excl", resulting in:
                exec_vnode=(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
            Then stop the parent mom host of <n7> (kill -STOP) and issue:
                pbs_release_nodes -j <job-id> <n4> <n5> <n7>
            causing <n4>, <n5>, and <n7> to still be tied to the job:
            node <n6> remains tied to the job as part of mom hostB,
            which satisfies the second super-chunk, and node <n7> is
            still assigned to the job as parent mom hostC has been
            stopped.
            Submit another job (job2) needing node <n7> and 1 cpu;
            the job ends up queued since the first job is still
            using <n7>.
            Now delete job2.
            Now suspend the first job, and all resources_assigned to
            the job's nodes are cleared.
            Now resume the mom of <n7> (kill -CONT). This mom tells
            the server to free up node <n7>, as the first job has
            been completely removed from that node.
            Now resume job1, and all resources_assigned of the job's
            nodes, including <n4> and <n5>, are shown allocated,
            with the resources on node <n7> freed.
            Then submit a new 1-cpu job that specifically asks for
            vnode <n7>; the job should run, taking vnode <n7>, but
            in the pbsnodes listing the vnode's state is still
            "free" while using 1 cpu and 1gb of memory. That is
            because there is still 1 cpu and 1 gb of memory left to
            use on vnode <n7>.
        """
        jid = self.create_and_submit_job('job11x')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job11x_select,
                                 'Resource_List.place': self.job11x_place,
                                 'schedselect': self.job11x_schedselect,
                                 'exec_host': self.job11x_exec_host,
                                 'exec_vnode': self.job11x_exec_vnode}, id=jid)
        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5, self.n7],
                                'job-exclusive', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # temporarily stop momC, preventing it from acting on released nodes
        self.momC.signal("-STOP")
        # Run pbs_release_nodes on nodes belonging to momB and momC
        cmd = [self.pbs_release_nodes_cmd, '-j', jid,
               self.n4, self.n5, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertEqual(ret['rc'], 0)
        # mom hostB and mom hostC continue to hold on to the job
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostB), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostC), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        # since not all vnodes from momB have been freed from the job,
        # the DELETE_JOB2 request from MS is not sent
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # since node <n7> from mom hostC has not been freed from the job
        # (that mom is currently stopped), the DELETE_JOB2 request from
        # MS is not sent
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # Verify remaining job resources.
        sel_esc = self.job11x_select.replace("+", "\+")
        exec_host_esc = self.job11x_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        exec_vnode_esc = self.job11x_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
        newsel = "1:mem=2097152kb:ncpus=3+1:ncpus=1"
        newsel_esc = newsel.replace("+", "\+")
        new_exec_host = self.job11x_exec_host.replace(
            "+%s/0" % (self.n7,), "")
        new_exec_host_esc = new_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        new_exec_vnode = self.job11x_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "+(%s:ncpus=1:mem=1048576kb)" % (self.n7,), "")
        new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace(
            "(", "\(").replace(")", "\)").replace("+", "\+")
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '2gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job11x_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode},
                                id=jid)
        # Though the job is listed with ncpus=4, taking away released vnodes
        # <n4> (1 cpu), <n5> (1 cpu), <n7> (1 cpu),
        # hostB hasn't released the job because <n6> is still part of it, and
        # <n7> hasn't been released because its mom is stopped.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5,
                                 self.n7], 'job-exclusive', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")
        # submit a new job needing node <n7>, which is still currently tied
        # to the previous job.
        jid2 = self.create_and_submit_job('job12')
        # we expect job_state to be Queued as the previous job still has
        # the vnode managed by hostC assigned exclusively.
        self.server.expect(JOB, 'comment', op=SET, id=jid2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid2)
        self.server.delete(jid2)
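        # (Suspending a job with "qsig -s suspend" is expected to release the
        # job's cpu licenses and clear resources_assigned on the server,
        # queue and vnodes while the job itself sits in state 'S'; the
        # expectations below verify exactly that, and resuming the job
        # restores the previous accounting.)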
        # now suspend previous job
        self.server.sigjob(jid, 'suspend')
        a = {'job_state': 'S'}
        self.server.expect(JOB, a, id=jid)
        # server's license_count used is 0 since job is suspended.
        self.license_count_match(0)
        self.match_vnode_status([self.n0, self.n1, self.n2, self.n3,
                                 self.n4, self.n5, self.n6, self.n7,
                                 self.n8, self.n9, self.n10], 'free')
        # check server's resources_assigned values
        self.server.expect(SERVER, {'resources_assigned.ncpus': 0,
                                    'resources_assigned.mem': '0kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 0,
                                   'resources_assigned.mem': '0kb'},
                           id="workq")
        # now resume previous job
        self.server.sigjob(jid, 'resume')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '2gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job11x_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode},
                                id=jid)
        # Since the job was resumed, the license count goes back to the same
        # number as before the job was suspended.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5,
                                 self.n7], 'job-exclusive', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")
        # resume momC
        self.momC.signal("-CONT")
        # With momC resumed, it now receives the DELETE_JOB2 request from
        # MS
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
        # with mom on hostC resumed from its stopped state, the released
        # node <n7>'s 1 cpu license is finally freed, bringing the license
        # count value to 7 (previous value) - 1
        self.license_count_match(6)
        # submit a 1-cpu job that specifically requests vnode <n7>
        jid3 = self.create_and_submit_job('job12')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '1gb',
                                 'Resource_List.ncpus': 1,
                                 'Resource_List.nodect': 1,
                                 'Resource_List.select': self.job12_select,
                                 'Resource_List.place': self.job12_place,
                                 'schedselect': self.job12_schedselect,
                                 'exec_host': self.job12_exec_host,
                                 'exec_vnode': self.job12_exec_vnode},
                                id=jid3, max_attempts=3)
        # total license = 6 (previous value) + 1 for job 'jid3'
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-exclusive', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        # Node <n7> shows as free since job 'jid3' did not request
        # exclusive access.
        jobs_assn2 = "%s/0" % (jid3,)
        self.match_vnode_status([self.n7], 'free', jobs_assn2,
                                1, '1048576kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")

    def test_release_nodes_excl_server_restart_quick(self):
        """
        Test:
            Test having a job submitted with exclusive placement
            (-l place=scatter:excl), then releasing a node from it
            whose parent mom is stopped, before stopping the server
            with "qterm -t quick", which leaves the job running.
            When the server is then started in the default warm
            mode, where running jobs retain their state, the job
            continues to have its previous node assignment,
            including the pending released node.
            Given a job submitted with a select spec of
            2 super-chunks of ncpus=3 and mem=2gb each,
            and 1 chunk of ncpus=1 and mem=1gb, along with
            a place spec of "scatter:excl", resulting in:
                exec_vnode=(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
            Then stop the parent mom host of <n7> (kill -STOP) and issue:
                pbs_release_nodes -j <job-id> <n4> <n5> <n7>
            causing <n4>, <n5>, and <n7> to still be tied to the job:
            node <n6> remains tied to the job as part of mom hostB,
            which satisfies the second super-chunk, and node <n7> is
            still assigned to the job as parent mom hostC has been
            stopped.
            Do a "qterm -t quick", which leaves the job running.
            Now start pbs_server in the default warm mode, where all
            running jobs are retained in that state, including their
            node assignments.
            The job is restored to the same node assignment as
            before, taking into account the released nodes.
            Now resume the mom of <n7> (kill -CONT). This mom tells
            the server to free up node <n7>, as the job has been
            completely removed from that node.
        """
        jid = self.create_and_submit_job('job11x')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job11x_select,
                                 'Resource_List.place': self.job11x_place,
                                 'schedselect': self.job11x_schedselect,
                                 'exec_host': self.job11x_exec_host,
                                 'exec_vnode': self.job11x_exec_vnode}, id=jid)
        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5, self.n7],
                                'job-exclusive', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # temporarily stop momC, preventing it from acting on released nodes
        self.momC.signal("-STOP")
        # Run pbs_release_nodes on nodes belonging to momB and momC
        cmd = [self.pbs_release_nodes_cmd, '-j', jid,
               self.n4, self.n5, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertEqual(ret['rc'], 0)
        # mom hostB and mom hostC continue to hold on to the job
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostB), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostC), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        # since not all vnodes from momB have been freed from the job,
        # the DELETE_JOB2 request from MS is not sent
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # since node <n7> from mom hostC has not been freed from the job
        # (that mom is currently stopped), the DELETE_JOB2 request from
        # MS is not sent
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # Verify remaining job resources.
        sel_esc = self.job11x_select.replace("+", "\+")
        exec_host_esc = self.job11x_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        exec_vnode_esc = self.job11x_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
        newsel = "1:mem=2097152kb:ncpus=3+1:ncpus=1"
        newsel_esc = newsel.replace("+", "\+")
        new_exec_host = self.job11x_exec_host.replace(
            "+%s/0" % (self.n7,), "")
        new_exec_host_esc = new_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        new_exec_vnode = self.job11x_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "+(%s:ncpus=1:mem=1048576kb)" % (self.n7,), "")
        new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace(
            "(", "\(").replace(")", "\)").replace("+", "\+")
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '2gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job11x_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode},
                                id=jid)
        # Though the job is listed with ncpus=4, taking away released vnodes
        # <n4> (1 cpu), <n5> (1 cpu), <n7> (1 cpu),
        # hostB hasn't released the job because <n6> is still part of it, and
        # <n7> hasn't been released because its mom is stopped.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5,
                                 self.n7], 'job-exclusive', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")
        # Stop and Start the server
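        # ("qterm -t quick" shuts the server down while leaving running jobs
        # in place, and a plain start is a warm start, so the job should
        # reappear with the reduced, post-release assignment. Contrast this
        # with the *_server_restart_immed tests further below, where
        # "qterm -t immediate" requeues the job and the original request is
        # restored after the restart.)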
        om = self.server.get_op_mode()
        self.server.set_op_mode(PTL_CLI)
        self.server.qterm(manner="quick")
        self.server.set_op_mode(om)
        self.assertFalse(self.server.isUp())
        self.server.start()
        self.assertTrue(self.server.isUp())
        # Job should have the same state as before
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '2gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job11x_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode},
                                id=jid)
        # After the restart, the server relicenses the job and the license
        # count returns to the same number as before the restart.
        self.logger.info("sleep for 10 secs while server relicenses job")
        time.sleep(10)
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-exclusive', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        # parent mom of <n7> is currently in a stopped state
        self.match_vnode_status([self.n7], 'state-unknown,down,job-exclusive',
                                jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")
        # resume momC
        self.momC.signal("-CONT")
        # With momC resumed, it now receives the DELETE_JOB2 request from
        # MS
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
        # with mom on hostC resumed from its stopped state, the released
        # node <n7>'s 1 cpu license is finally freed, bringing the license
        # count value to 7 (previous value) - 1
        self.license_count_match(6)

    def test_release_nodes_excl_server_restart_immed(self):
        """
        Test:
            Test having a job submitted with exclusive placement
            (-l place=scatter:excl), then releasing a node from it
            whose parent mom is stopped, before stopping the server
            with "qterm -t immediate", which requeues the job
            completely. When the server is started again, the job
            gets assigned the vnodes from the original request,
            before the pbs_release_nodes call.
            Given a job submitted with a select spec of
            2 super-chunks of ncpus=3 and mem=2gb each,
            and 1 chunk of ncpus=1 and mem=1gb, along with
            a place spec of "scatter:excl", resulting in:
                exec_vnode=(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
            Then stop the parent mom host of <n7> (kill -STOP) and issue:
                pbs_release_nodes -j <job-id> <n4> <n5> <n7>
            causing <n4>, <n5>, and <n7> to still be tied to the job:
            node <n6> remains tied to the job as part of mom hostB,
            which satisfies the second super-chunk, and node <n7> is
            still assigned to the job as parent mom hostC has been
            stopped.
            Do a "qterm -t immediate", which requeues the currently
            running job.
            Now start pbs_server.
            The job goes back to being assigned the original
            resources from before pbs_release_nodes was called.
        """
        jid = self.create_and_submit_job('job11x')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job11x_select,
                                 'Resource_List.place': self.job11x_place,
                                 'schedselect': self.job11x_schedselect,
                                 'exec_host': self.job11x_exec_host,
                                 'exec_vnode': self.job11x_exec_vnode}, id=jid)
        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5, self.n7],
                                'job-exclusive', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # temporarily stop momC, preventing it from acting on released nodes
        self.momC.signal("-STOP")
        # Run pbs_release_nodes on nodes belonging to momB and momC
        cmd = [self.pbs_release_nodes_cmd, '-j', jid,
               self.n4, self.n5, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertEqual(ret['rc'], 0)
        # mom hostB and mom hostC continue to hold on to the job
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostB), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostC), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        # since not all vnodes from momB have been freed from the job,
        # the DELETE_JOB2 request from MS is not sent
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # since node <n7> from mom hostC has not been freed from the job
        # (that mom is currently stopped), the DELETE_JOB2 request from
        # MS is not sent
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # Verify remaining job resources.
        sel_esc = self.job11x_select.replace("+", "\+")
        exec_host_esc = self.job11x_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        exec_vnode_esc = self.job11x_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
        newsel = "1:mem=2097152kb:ncpus=3+1:ncpus=1"
        newsel_esc = newsel.replace("+", "\+")
        new_exec_host = self.job11x_exec_host.replace(
            "+%s/0" % (self.n7,), "")
        new_exec_host_esc = new_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        new_exec_vnode = self.job11x_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "+(%s:ncpus=1:mem=1048576kb)" % (self.n7,), "")
        new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace(
            "(", "\(").replace(")", "\)").replace("+", "\+")
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '2gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job11x_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode},
                                id=jid)
        # Though the job is listed with ncpus=4, taking away released vnodes
        # <n4> (1 cpu), <n5> (1 cpu), <n7> (1 cpu),
        # hostB hasn't released the job because <n6> is still part of it, and
        # <n7> hasn't been released because its mom is stopped.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5,
                                 self.n7], 'job-exclusive', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")
        # Stop and Start the server
        om = self.server.get_op_mode()
        self.server.set_op_mode(PTL_CLI)
        self.server.qterm(manner="immediate")
        self.server.set_op_mode(om)
        self.assertFalse(self.server.isUp())
        # resume momC, but this is a stale request (nothing happens)
        # since the server is down.
        self.momC.signal("-CONT")
        # start the server again
        self.server.start()
        self.assertTrue(self.server.isUp())
        # make sure the job is now running with the assigned resources
        # from the original request
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job11x_select,
                                 'Resource_List.place': self.job11x_place,
                                 'schedselect': self.job11x_schedselect,
                                 'exec_host': self.job11x_exec_host,
                                 'exec_vnode': self.job11x_exec_vnode}, id=jid)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5,
                                 self.n7], 'job-exclusive', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")

    def test_release_nodes_shared_server_restart_quick(self):
        """
        Test:
            Like the test_release_nodes_excl_server_restart_quick
            test, except the job submitted does not have exclusive
            placement, just -l place=scatter.
            The results are the same, except the vnode states are
            either "job-busy" or "free" when there are resources
            still available to share.
        """
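        # (Without the "excl" placement modifier, a vnode that still has
        # unassigned ncpus/mem left over stays in state "free" even while it
        # is partially allocated to this job, which is why <n7> is matched
        # as "free" rather than "job-exclusive" throughout this test.)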
        jid = self.create_and_submit_job('job11')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job11_select,
                                 'Resource_List.place': self.job11_place,
                                 'schedselect': self.job11_schedselect,
                                 'exec_host': self.job11_exec_host,
                                 'exec_vnode': self.job11_exec_vnode}, id=jid)
        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        # node <n7> is free since there's still 1 ncpus and 1 gb
        # that can be shared with other jobs
        self.match_vnode_status([self.n7], 'free', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # temporarily stop momC, preventing it from acting on released nodes
        self.momC.signal("-STOP")
        # Run pbs_release_nodes on nodes belonging to momB and momC
        cmd = [self.pbs_release_nodes_cmd, '-j', jid,
               self.n4, self.n5, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertEqual(ret['rc'], 0)
        # mom hostB and mom hostC continue to hold on to the job
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostB), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostC), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        # since not all vnodes from momB have been freed from the job,
        # the DELETE_JOB2 request from MS is not sent
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # since node <n7> from mom hostC has not been freed from the job
        # (that mom is currently stopped), the DELETE_JOB2 request from
        # MS is not sent
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # Verify remaining job resources.
        sel_esc = self.job11_select.replace("+", "\+")
        exec_host_esc = self.job11_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        exec_vnode_esc = self.job11_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
        newsel = "1:mem=2097152kb:ncpus=3+1:ncpus=1"
        newsel_esc = newsel.replace("+", "\+")
        new_exec_host = self.job11_exec_host.replace(
            "+%s/0" % (self.n7,), "")
        new_exec_host_esc = new_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        new_exec_vnode = self.job11_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "+(%s:ncpus=1:mem=1048576kb)" % (self.n7,), "")
        new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace(
            "(", "\(").replace(")", "\)").replace("+", "\+")
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '2gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job11_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode},
                                id=jid)
        # Though the job is listed with ncpus=4, taking away released vnodes
        # <n4> (1 cpu), <n5> (1 cpu), <n7> (1 cpu),
        # hostB hasn't released the job because <n6> is still part of it, and
        # <n7> hasn't been released because its mom is stopped.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        # node <n7> is free since there's still 1 ncpus and 1 gb
        # that can be shared with other jobs
        self.match_vnode_status([self.n7], 'free', jobs_assn1,
                                1, '1048576kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")
        # Stop and Start the server
        om = self.server.get_op_mode()
        self.server.set_op_mode(PTL_CLI)
        self.server.qterm(manner="quick")
        self.server.set_op_mode(om)
        self.assertFalse(self.server.isUp())
        self.server.start()
        self.assertTrue(self.server.isUp())
        # Job should have the same state as before
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '2gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job11_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode},
                                id=jid)
        # After the restart, the server relicenses the job and the license
        # count returns to the same number as before the restart.
        self.logger.info("sleep for 10 secs while server relicenses job")
        time.sleep(10)
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        # parent mom of <n7> is currently in a stopped state
        self.match_vnode_status([self.n7], 'state-unknown,down', jobs_assn1,
                                1, '1048576kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")
        # resume momC
        self.momC.signal("-CONT")
        # With momC resumed, it now receives the DELETE_JOB2 request from
        # MS
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
        # with mom on hostC resumed from its stopped state, the released
        # node <n7>'s 1 cpu license is finally freed, bringing the license
        # count value to 7 (previous value) - 1
        self.license_count_match(6)

    def test_release_nodes_shared_server_restart_immed(self):
        """
        Test:
            Like the test_release_nodes_excl_server_restart_immed
            test, except the job submitted does not have exclusive
            placement, just -l place=scatter.
            The results are the same, except the vnode states are
            either "job-busy" or "free" when there are resources
            still available to share.
        """
        jid = self.create_and_submit_job('job11')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job11_select,
                                 'Resource_List.place': self.job11_place,
                                 'schedselect': self.job11_schedselect,
                                 'exec_host': self.job11_exec_host,
                                 'exec_vnode': self.job11_exec_vnode}, id=jid)
        # server's license_count used value matches job's 'ncpus' value.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        # node <n7> still has resources (ncpus=1, mem=1gb) to share
        self.match_vnode_status([self.n7], 'free', jobs_assn1,
                                1, '1048576kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        # temporarily stop momC, preventing it from acting on released nodes
        self.momC.signal("-STOP")
        # Run pbs_release_nodes on nodes belonging to momB and momC
        cmd = [self.pbs_release_nodes_cmd, '-j', jid,
               self.n4, self.n5, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertEqual(ret['rc'], 0)
        # mom hostB and mom hostC continue to hold on to the job
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostB), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostC), n=10,
            regexp=True,
            existence=False, max_attempts=5, interval=1)
        # since not all vnodes from momB have been freed from the job,
        # the DELETE_JOB2 request from MS is not sent
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # since node <n7> from mom hostC has not been freed from the job
        # (that mom is currently stopped), the DELETE_JOB2 request from
        # MS is not sent
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            existence=False, max_attempts=5, interval=1)
        # Verify remaining job resources.
        sel_esc = self.job11_select.replace("+", "\+")
        exec_host_esc = self.job11_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        exec_vnode_esc = self.job11_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
        newsel = "1:mem=2097152kb:ncpus=3+1:ncpus=1"
        newsel_esc = newsel.replace("+", "\+")
        new_exec_host = self.job11_exec_host.replace(
            "+%s/0" % (self.n7,), "")
        new_exec_host_esc = new_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        new_exec_vnode = self.job11_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "+(%s:ncpus=1:mem=1048576kb)" % (self.n7,), "")
        new_exec_vnode_esc = new_exec_vnode.replace(
            "[", "\[").replace("]", "\]").replace(
            "(", "\(").replace(")", "\)").replace("+", "\+")
        # job's substate is 41 (PRERUN) since MS mom is stopped
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '2gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job11_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode},
                                id=jid)
        # Though the job is listed with ncpus=4, taking away released vnodes
        # <n4> (1 cpu), <n5> (1 cpu), <n7> (1 cpu),
        # hostB hasn't released the job because <n6> is still part of it, and
        # <n7> hasn't been released because its mom is stopped.
        self.license_count_match(7)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        # node <n7> still has resources (ncpus=1, mem=1gb) to share
        self.match_vnode_status([self.n7], 'free', jobs_assn1,
                                1, '1048576kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")
        # Stop and Start the server
        om = self.server.get_op_mode()
        self.server.set_op_mode(PTL_CLI)
        self.server.qterm(manner="immediate")
        self.server.set_op_mode(om)
        self.assertFalse(self.server.isUp())
        # resume momC, but this is a stale request (nothing happens)
        # since the server is down.
        self.momC.signal("-CONT")
        # start the server again
        self.server.start()
        self.assertTrue(self.server.isUp())
        # make sure the job is now running with the assigned resources
        # from the original request
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job11_select,
                                 'Resource_List.place': self.job11_place,
                                 'schedselect': self.job11_schedselect,
                                 'exec_host': self.job11_exec_host,
                                 'exec_vnode': self.job11_exec_vnode}, id=jid)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1,
                                '1048576kb')
        self.match_vnode_status([self.n3, self.n6],
                                'job-busy', jobs_assn1, 1, '0kb')
        # node <n7> still has resources (ncpus=1, mem=1gb) to share
        self.server.expect(VNODE, {'state': 'free',
                                   'jobs': jobs_assn1,
                                   'resources_assigned.ncpus': 1,
                                   'resources_assigned.mem': '1048576kb'},
                           id=self.n7)
        self.match_vnode_status([self.n7], 'free', jobs_assn1,
                                1, '1048576kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id="workq")

    def test_release_mgr_oper(self):
        """
        Test that nodes can be released by a PBS Manager and by an Operator.
        """
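        # (pbs_release_nodes is run here via run_cmd with runas=MGR_USER and
        # runas=OPER_USER rather than as root, to check that holders of the
        # Manager and Operator roles are allowed to release a job's vnodes.)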
        jid = self.create_and_submit_job('job1_5')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode}, id=jid)
        manager = str(MGR_USER) + '@*'
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'managers': (INCR, manager)},
                            sudo=True)
        operator = str(OPER_USER) + '@*'
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'operators': (INCR, operator)},
                            sudo=True)
        # Release hostC as manager
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     runas=MGR_USER)
        self.assertEqual(ret['rc'], 0)
        # Only mom hostC will get the job summary since it was released
        # early, courtesy of its sole vnode <n7> being released.
        self.momA.log_match(
            "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
            regexp=True)
        # Only mom hostC will get the IM_DELETE_JOB2 request
        self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
        # Release vnodes from momB as operator
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5, self.n6]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     runas=OPER_USER)
        self.assertEqual(ret['rc'], 0)
        # momB's host will not get the job summary reported
        self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
            jid, self.hostB), n=10, regexp=True, max_attempts=5,
            existence=False, interval=1)
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
                            max_attempts=5, existence=False, interval=1)
        # Verify remaining job resources.
        sel_esc = self.job1_select.replace("+", "\+")
        exec_host_esc = self.job1_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
            "]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
        newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=1"
        newsel_esc = newsel.replace("+", "\+")
        new_exec_host = self.job1_exec_host.replace(
            "+%s/0*2" % (self.n7,), "")
        new_exec_host_esc = new_exec_host.replace(
            "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
        new_exec_vnode = self.job1_exec_vnode.replace(
            "+%s:mem=1048576kb:ncpus=1" % (self.n5,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "+%s:ncpus=1" % (self.n6,), "")
        new_exec_vnode = new_exec_vnode.replace(
            "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
        new_exec_vnode_esc = \
            new_exec_vnode.replace("[", "\[").replace("]", "\]").replace(
                "(", "\(").replace(")", "\)").replace("+", "\+")
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '3gb',
                                 'Resource_List.ncpus': 4,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job1_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode}, id=jid)
        # Though the job is listed with ncpus=4, taking away released vnodes
        # <n5> (1 cpu), <n6> (1 cpu), <n7> (2 cpus),
        # only <n7> got released. <n5> and <n6> are part of a super-chunk
        # that wasn't fully released.
        self.license_count_match(6)
        # Check account update ('u') record
        self.match_accounting_log('u', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc, "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1_sel_esc)
        # Check to make sure 'c' (next) record got generated
        self.match_accounting_log('c', jid, new_exec_host_esc,
                                  new_exec_vnode_esc, "3145728kb",
                                  4, 2, self.job1_place, newsel_esc)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        # <n5> still job-busy
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        # <n6> still job-busy
        self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
                                '0kb')
        # <n7> now free
        self.match_vnode_status([self.n0, self.n7, self.n8, self.n9,
                                 self.n10], 'free')
        self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
                                    'resources_assigned.mem': '4194304kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
                                   'resources_assigned.mem': '4194304kb'},
                           id="workq")
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, new_exec_host))
        self.server.delete(jid)
        # Check account phased end ('e') record
        self.match_accounting_log('e', jid, new_exec_host_esc,
                                  new_exec_vnode_esc,
                                  "3145728kb", 4, 2,
                                  self.job1_place,
                                  newsel_esc)
        # Check to make sure 'E' (end of job) record got generated
        self.match_accounting_log('E', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc, "6gb",
                                  8, 3, self.job1_place, self.job1_sel_esc)

    def test_release_job_array(self):
        """
        Release vnodes from a job array and subjob.
        """
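        # (pbs_release_nodes is expected to be rejected for an array job
        # parent; the attempt below anticipates a "not supported for Array
        # jobs" error, while releasing vnodes from an individual running
        # subjob works the same way as for a regular job.)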
  5402. jid = self.create_and_submit_job('jobA')
  5403. self.server.expect(JOB, {'job_state': 'B',
  5404. 'Resource_List.mem': '6gb',
  5405. 'Resource_List.ncpus': 8,
  5406. 'Resource_List.nodect': 3,
  5407. 'Resource_List.select': self.job1_select,
  5408. 'Resource_List.place': self.job1_place,
  5409. 'schedselect': self.job1_schedselect}, id=jid)
  5410. # Release nodes from job array. It will fail
  5411. cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
  5412. try:
  5413. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  5414. sudo=True)
  5415. except PtlExceptions as e:
  5416. self.assertTrue("not supported for Array jobs" in e.msg)
  5417. self.assertFalse(e.rc)
  5418. # Verify the same for subjob1
  5419. subjob1 = jid.replace('[]', '[1]')
  5420. self.server.expect(JOB, {'job_state': 'R',
  5421. 'Resource_List.mem': '6gb',
  5422. 'Resource_List.ncpus': 8,
  5423. 'Resource_List.nodect': 3,
  5424. 'Resource_List.select': self.job1_select,
  5425. 'Resource_List.place': self.job1_place,
  5426. 'schedselect': self.job1_schedselect,
  5427. 'exec_host': self.job1_exec_host,
  5428. 'exec_vnode': self.job1_exec_vnode},
  5429. id=subjob1)
  5430. # Server's license_count used value matches job's 'ncpus' value.
  5431. self.license_count_match(8)
  5432. # Check various vnode status.
  5433. jobs_assn1 = "%s/0" % (subjob1,)
  5434. self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
  5435. 'job-busy', jobs_assn1, 1, '1048576kb')
  5436. self.match_vnode_status([self.n3, self.n6],
  5437. 'job-busy', jobs_assn1, 1, '0kb')
  5438. jobs_assn2 = "%s/0, %s/1" % (subjob1, subjob1)
  5439. self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
  5440. 2, '2097152kb')
  5441. self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
  5442. self.assertTrue(
  5443. self.pbs_nodefile_match_exec_host(subjob1, self.job1_exec_host))
  5444. # Run pbs_release_nodes as root
  5445. cmd = [self.pbs_release_nodes_cmd, '-j', subjob1, self.n4]
  5446. ret = self.server.du.run_cmd(self.server.hostname, cmd,
  5447. sudo=True)
  5448. self.assertEqual(ret['rc'], 0)
  5449. # Verify mom_logs
  5450. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  5451. subjob1, self.hostB), n=10,
  5452. regexp=True,
  5453. max_attempts=5,
  5454. existence=False, interval=1)
  5455. self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
  5456. subjob1, self.hostC), n=10,
  5457. regexp=True, max_attempts=5,
  5458. existence=False, interval=1)
  5459. # momB's host will not get DELETE_JOB2 request since
  5460. # not all its vnodes have been released yet from the job.
  5461. self.momB.log_match("Job;%s;DELETE_JOB2 received" % (subjob1,),
  5462. n=20, max_attempts=5,
  5463. existence=False, interval=1)
  5464. # Verify remaining job resources.
  5465. newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
  5466. "1:ncpus=2:mem=2097152kb"
  5467. newsel_esc = newsel.replace("+", "\+")
  5468. new_exec_host = self.job1_exec_host
  5469. # Below variable is being used for the accounting log match
  5470. # which is currently blocked on PTL bug PP-596.
  5471. # new_exec_host_esc = self.job1_exec_host.replace(
  5472. # "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  5473. new_exec_vnode = self.job1_exec_vnode.replace(
  5474. "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
  5475. new_exec_vnode_esc = new_exec_vnode.replace(
  5476. "[", "\[").replace("]", "\]").replace(
  5477. "(", "\(").replace(")", "\)").replace("+", "\+")
  5478. self.server.expect(JOB, {'job_state': 'R',
  5479. 'Resource_List.mem': '5gb',
  5480. 'Resource_List.ncpus': 7,
  5481. 'Resource_List.select': newsel,
  5482. 'Resource_List.place': self.job1_place,
  5483. 'Resource_List.nodect': 3,
  5484. 'schedselect': newsel,
  5485. 'exec_host': self.job1_exec_host,
  5486. 'exec_vnode': new_exec_vnode}, id=subjob1)
  5487. # Though the job is listed with ncpus=7 taking away released vnode
  5488. # <n4>, it's coming from a super-chunk where other vnodes <n5> and
  5489. # <n6> are still assigned to the job. So the parent mom of <n4>
  5490. # till won't release the job and thus, the 1 license for it is still
  5491. # allocated.
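        # Illustrative sketch (assumed shape, not taken from the fixtures
        # above): with a super-chunk like "(<n4>+<n5>+<n6>)" hosted by a
        # single mom,
        #   exec_vnode before release: (...)+(<n4>+<n5>+<n6>)+(...)
        #   exec_vnode after release:  (...)+(<n5>+<n6>)+(...)
        # The parent mom stays attached until the last vnode of the chunk is
        # released, which is why the licensed ncpus remain at 8 here.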
        self.license_count_match(8)

        # BELOW CODE IS BLOCKED ON PP-596
        # Check account update ('u') record
        # self.match_accounting_log('u', subjob1, self.job1_exec_host_esc,
        #                           self.job1_exec_vnode_esc, "6gb", 8, 3,
        #                           self.job1_place,
        #                           self.job1_sel_esc)

        # Check to make sure 'c' (next) record got generated
        # self.match_accounting_log('c', subjob1, self.job1_exec_host_esc,
        #                           new_exec_vnode_esc, "5242880kb",
        #                           7, 3, self.job1_place, newsel_esc)

        # Check various vnode status.
        jobs_assn1 = "%s/0" % (subjob1,)
        self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
                                '0kb')
        jobs_assn2 = "%s/0, %s/1" % (subjob1, subjob1)
        self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')

        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'})
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id="workq")

        self.assertTrue(
            self.pbs_nodefile_match_exec_host(subjob1, new_exec_host))

        self.server.delete(subjob1)

        # Check account phased end ('e') record
        # self.match_accounting_log('e', subjob1, new_exec_host_esc,
        #                           new_exec_vnode_esc,
        #                           "5242880kb", 7, 3,
        #                           self.job1_place,
        #                           newsel_esc)

        # Check to make sure 'E' (end of job) record got generated
        # self.match_accounting_log('E', subjob1, self.job1_exec_host_esc,
        #                           self.job1_exec_vnode_esc, "6gb",
        #                           8, 3, self.job1_place, self.job1_sel_esc)

    def test_release_job_states(self):
        """
        Release nodes on jobs in various states: Q, H, and W
        """
        # Submit a regular job that cannot run
        a = {'Resource_List.ncpus': 100}
        j = Job(TEST_USER, a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid)

        # Release nodes from a queued job
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertNotEqual(ret['rc'], 0)
        self.server.delete(jid, wait=True)

        # Submit a held job and try releasing the node
        j1 = Job(TEST_USER, {ATTR_h: None})
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'H'}, id=jid1)
        cmd = [self.pbs_release_nodes_cmd, '-j', jid1, self.n4]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertNotEqual(ret['rc'], 0)
        self.server.delete(jid1, wait=True)

        # Submit a job in W state and try releasing the node
        mydate = int(time.time()) + 120
        mytime = convert_time('%m%d%H%M', str(mydate))
        j2 = Job(TEST_USER, {ATTR_a: mytime})
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'W'}, id=jid2)
        cmd = [self.pbs_release_nodes_cmd, '-j', jid2, self.n4]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertNotEqual(ret['rc'], 0)
        self.server.delete(jid2, wait=True)

    def test_release_finishjob(self):
        """
        Test that releasing vnodes on finished jobs will fail;
        also verify the updated schedselect on a finished job
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': "true"}, sudo=True)
        jid = self.create_and_submit_job('job1_5')
        self.server.expect(JOB, {'job_state': "R"}, id=jid)

        # Release hostC
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd)
        self.assertEqual(ret['rc'], 0)

        # Submit another job and make sure it is
        # picked up by hostC
        j = Job(TEST_USER,
                {'Resource_List.select': "1:host=" + self.hostC})
        jid2 = self.server.submit(j)
        ehost = self.hostC + "/1"
        self.server.expect(JOB, {'job_state': "R",
                                 "exec_host": ehost}, id=jid2)
        self.server.delete(jid, wait=True)

        # Release vnode4 from a finished job. It will throw an error.
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
        ret = self.server.du.run_cmd(self.server.hostname, cmd)
        self.assertNotEqual(ret['rc'], 0)

        # Verify the schedselect for a finished job
        newsel = "1:mem=2097152kb:ncpus=3+1:mem=2097152kb:ncpus=3"
        new_exec_host = "%s/0*0+%s/0*0" % (self.n0, self.hostB)
        new_exec_vnode = self.job1_exec_vnode.replace(
            "+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
        self.server.expect(JOB, {'job_state': 'F',
                                 'Resource_List.mem': '4194304kb',
                                 'Resource_List.ncpus': 6,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job1_place,
                                 'Resource_List.nodect': 2,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode},
                           extend='x', id=jid)

    def test_release_suspendjob(self):
        """
        Test that releasing nodes on a suspended job will also
        fail and schedselect will not change
        """
        jid = self.create_and_submit_job('job1_5')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode}, id=jid)
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
        ret = self.server.du.run_cmd(self.server.hostname, cmd)
        self.assertEqual(ret['rc'], 0)

        # Verify remaining job resources
        newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
                 "1:ncpus=2:mem=2097152kb"
        new_exec_vnode = self.job1_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job1_place,
                                 'Resource_List.nodect': 3,
                                 'schedselect': newsel,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': new_exec_vnode}, id=jid)

        # Suspend the job with qsig
        self.server.sigjob(jid, 'suspend', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid)

        # Try releasing a node from the suspended job
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     sudo=True)
        self.assertNotEqual(ret['rc'], 0)

        # Verify that resources won't change
        self.server.expect(JOB, {'job_state': 'S',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job1_place,
                                 'Resource_List.nodect': 3,
                                 'schedselect': newsel,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': new_exec_vnode}, id=jid)

        # Resume the job and make sure it is running
        self.server.sigjob(jid, 'resume')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job1_place,
                                 'Resource_List.nodect': 3,
                                 'schedselect': newsel,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': new_exec_vnode}, id=jid)

    @timeout(500)
    def test_release_multi_jobs(self):
        """
        Release vnodes when multiple jobs are present
        """
        # Delete the vnodes and recreate them
        self.momA.delete_vnode_defs()
        self.momB.delete_vnode_defs()
        self.momA.restart()
        self.momB.restart()
        self.server.manager(MGR_CMD_DELETE, NODE, None, "")
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA)
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB)
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostC)
        self.server.manager(MGR_CMD_SET, NODE,
                            {'resources_available.ncpus': 3},
                            id=self.hostA)
        self.server.manager(MGR_CMD_SET, NODE,
                            {'resources_available.ncpus': 3},
                            id=self.hostB)
        self.server.manager(MGR_CMD_SET, NODE,
                            {'resources_available.ncpus': 3},
                            id=self.hostC)

        # Submit multiple jobs
        jid1 = self.create_and_submit_job('job13')
        jid2 = self.create_and_submit_job('job13')
        jid3 = self.create_and_submit_job('job13')
        e_host_j1 = self.hostA + "/0+" + self.hostB + "/0+" + self.hostC + "/0"
        e_host_j2 = self.hostA + "/1+" + self.hostB + "/1+" + self.hostC + "/1"
        e_host_j3 = self.hostA + "/2+" + self.hostB + "/2+" + self.hostC + "/2"
        e_vnode = "(%s:ncpus=1)+(%s:ncpus=1)+(%s:ncpus=1)" \
            % (self.hostA, self.hostB, self.hostC)
        self.server.expect(JOB, {"job_state=R": 3})
        self.server.expect(JOB, {"exec_host": e_host_j1,
                                 "exec_vnode": e_vnode}, id=jid1)
        self.server.expect(JOB, {"exec_host": e_host_j2,
                                 "exec_vnode": e_vnode}, id=jid2)
        self.server.expect(JOB, {"exec_host": e_host_j3,
                                 "exec_vnode": e_vnode}, id=jid3)

        # Verify that 3 processes are running on hostB
        n = retry = 5
        for _ in range(n):
            process = 0
            self.server.pu.get_proc_info(
                self.momB.hostname, ".*fib.*", None, regexp=True)
            if (self.server.pu.processes is not None):
                for key in self.server.pu.processes:
                    if ("fib" in key):
                        process = len(self.server.pu.processes[key])
                        self.logger.info(
                            "length of the process is " + str(process) +
                            ", expected 3")
            if process == 3:
                break
            retry -= 1
            if retry == 0:
                raise AssertionError("not found 3 fib processes")
            self.logger.info("sleeping 3 secs before next retry")
            time.sleep(3)

        # Release node2 from job1 only
        cmd = [self.pbs_release_nodes_cmd, '-j', jid1, self.hostB]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     runas=TEST_USER)
        self.assertEqual(ret['rc'], 0)
        self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid1,),
                            max_attempts=18, interval=2)

        # Verify that only 2 processes are left on hostB now
        process = 0
        self.server.pu.get_proc_info(
            self.momB.hostname, ".*fib.*", None, regexp=True)
        if (self.server.pu.processes is not None):
            for key in self.server.pu.processes:
                if ("fib" in key):
                    process = len(self.server.pu.processes[key])
                    self.logger.info("length of the process is %d" %
                                     (process,))
        self.assertEqual(process, 2)

        # Mom logs only have the message for job1 on hostB
        self.momA.log_match(
            "Job;%s;%s.+cput=.+mem.+" % (jid1, self.hostB),
            max_attempts=18, interval=2, regexp=True)
        self.momA.log_match(
            "Job;%s;%s.+cput=.+mem.+" % (jid2, self.hostB),
            max_attempts=5, regexp=True,
            existence=False, interval=1)
        self.momA.log_match(
            "Job;%s;%s.+cput=.+mem.+" % (jid3, self.hostB),
            max_attempts=5, regexp=True,
            existence=False, interval=1)

        # Verify the new schedselect for job1
        new_e_host_j1 = e_host_j1.replace("+%s/0" % (self.hostB,), "")
        new_e_vnode = e_vnode.replace("+(%s:ncpus=1)" % (self.hostB,), "")
        self.server.expect(JOB, {'job_state': "R",
                                 "exec_host": new_e_host_j1,
                                 "exec_vnode": new_e_vnode,
                                 "schedselect": "1:ncpus=1+1:ncpus=1",
                                 "Resource_List.ncpus": 2,
                                 "Resource_List.nodect": 2}, id=jid1)

        # Verify that host and vnode won't change for job2 and job3
        self.server.expect(JOB, {'job_state': "R",
                                 "exec_host": e_host_j2,
                                 "exec_vnode": e_vnode,
                                 "Resource_List.nodect": 3}, id=jid2)
        self.server.expect(JOB, {'job_state': 'R',
                                 "exec_host": e_host_j3,
                                 "exec_vnode": e_vnode,
                                 "Resource_List.nodect": 3}, id=jid3)

    def test_PBS_JOBID(self):
        """
        Test that if -j jobid is not provided then it is
        picked up from the $PBS_JOBID env variable in the job script
        """
        # This one has a job script that calls 'pbs_release_nodes'
        # (no jobid specified)
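        # As a sketch (the real script comes from the 'job1_6' fixture
        # defined earlier in this file), such a script would run something
        # like:
        #   pbs_release_nodes <vnode>
        # with no '-j <jobid>', relying on $PBS_JOBID from the job's
        # environment instead.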
        jid = self.create_and_submit_job('job1_6')
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode}, id=jid)

        # Verify remaining job resources
        newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
                 "1:ncpus=2:mem=2097152kb"
        newsel_esc = newsel.replace("+", "\+")
        new_exec_vnode = self.job1_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
        new_exec_vnode_esc = new_exec_vnode.replace(
            "[", "\[").replace("]", "\]").replace(
            "(", "\(").replace(")", "\)").replace("+", "\+")
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.select': newsel,
                                 'Resource_List.place': self.job1_place,
                                 'Resource_List.nodect': 3,
                                 'schedselect': newsel,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': new_exec_vnode},
                           id=jid, interval=1, max_attempts=30)

        # Check account update ('u') record
        self.match_accounting_log('u', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc, "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1_sel_esc)

        # Check to make sure 'c' (next) record got generated
        self.match_accounting_log('c', jid, self.job1_exec_host_esc,
                                  new_exec_vnode_esc, "5242880kb",
                                  7, 3, self.job1_place, newsel_esc)

    def test_release_nodes_on_stageout_diffvalues(self):
        """
        Set release_nodes_on_stageout to values other than
        true or false
        """
        a = {ATTR_W: "release_nodes_on_stageout=-1"}
        j = Job(TEST_USER, a)
        try:
            self.server.submit(j)
        except PtlException as e:
            self.assertTrue("illegal -W value" in e.msg[0])

        a = {ATTR_W: "release_nodes_on_stageout=11"}
        j = Job(TEST_USER, a)
        try:
            self.server.submit(j)
        except PtlException as e:
            self.assertTrue("illegal -W value" in e.msg[0])

        a = {ATTR_W: "release_nodes_on_stageout=tru"}
        j = Job(TEST_USER, a)
        try:
            self.server.submit(j)
        except PtlException as e:
            self.assertTrue("illegal -W value" in e.msg[0])

    def test_resc_accumulation(self):
        """
        Test that resources get accumulated when a mom is released
        """
        # skip this test due to PP-972
        self.skip_test(reason="Test fails due to PP-972")
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': "true"}, sudo=True)

        # Create custom resources
        attr = {}
        attr['type'] = 'float'
        attr['flag'] = 'nh'
        r = 'foo_f'
        self.server.manager(
            MGR_CMD_CREATE, RSC, attr, id=r, runas=ROOT_USER, logerr=False)
        attr1 = {}
        attr1['type'] = 'size'
        attr1['flag'] = 'nh'
        r1 = 'foo_i'
        self.server.manager(
            MGR_CMD_CREATE, RSC, attr1, id=r1, runas=ROOT_USER, logerr=False)

        hook_body = """
import pbs
e = pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executed epilogue hook")
if e.job.in_ms_mom():
    e.job.resources_used["vmem"] = pbs.size("9gb")
    e.job.resources_used["foo_i"] = pbs.size(999)
    e.job.resources_used["foo_f"] = 0.09
else:
    e.job.resources_used["vmem"] = pbs.size("10gb")
    e.job.resources_used["foo_i"] = pbs.size(1000)
    e.job.resources_used["foo_f"] = 0.10
"""
        hook_name = "epi"
        a = {'event': "execjob_epilogue", 'enabled': 'True'}
        rv = self.server.create_import_hook(
            hook_name,
            a,
            hook_body,
            overwrite=True)
        self.assertTrue(rv)

        jid = self.create_and_submit_job('job1_5')
        self.server.expect(JOB, {'job_state': "R"}, id=jid)

        # Release hostC
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n7]
        ret = self.server.du.run_cmd(self.server.hostname, cmd)
        self.assertEqual(ret['rc'], 0)
        self.momC.log_match("executed epilogue hook", max_attempts=10)
        self.momC.log_match("DELETE_JOB2 received", max_attempts=10)
        self.server.delete(jid, wait=True)
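        # Expected accumulation, as a quick worked check (one MS plus two
        # non-MS epilogue contributions from the hook above):
        #   vmem : 9gb + 10gb + 10gb = 29gb
        #   foo_f: 0.09 + 0.10 + 0.10 = 0.29
        #   foo_i: 999b + 1000b + 1000b = 2999b, which shows up below as 3kb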
        self.server.expect(JOB, {'job_state': 'F',
                                 "resources_used.foo_i": "3kb",
                                 "resources_used.foo_f": '0.29',
                                 "resources_used.vmem": '29gb'}, id=jid)

    @timeout(500)
    def test_release_reservations(self):
        """
        Releasing nodes from a reservation will throw an error. However,
        jobs inside the reservation queue work as expected.
        """
        # Create a reservation on multiple nodes
        start = int(time.time()) + 30
        a = {'Resource_List.select': self.job1_select,
             'Resource_List.place': 'scatter',
             'reserve_start': start}
        r = Reservation(TEST_USER, a)
        rid = self.server.submit(r)
        rid = rid.split('.')[0]
        self.server.expect(RESV,
                           {'reserve_state': (MATCH_RE, "RESV_CONFIRMED|2")},
                           id=rid)

        # Release a vnode from the reservation. It will throw an error.
        cmd = [self.pbs_release_nodes_cmd, '-j', rid, self.n5]
        r = self.server.du.run_cmd(self.server.hostname, cmd)
        self.assertNotEqual(r['rc'], 0)

        # Submit a job inside the reservation and release a vnode
        a = {'queue': rid,
             'Resource_List.select': self.job1_select}
        j = Job(TEST_USER, a)
        jid = self.server.submit(j)

        # Wait for the job to start
        self.server.expect(JOB, {'job_state': 'R'},
                           offset=30, id=jid, max_attempts=30)

        # Release vnodes from the job
        cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5]
        r = self.server.du.run_cmd(self.server.hostname, cmd)
        self.assertEqual(r['rc'], 0)

        # Verify the new schedselect
        newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
                 "1:ncpus=2:mem=2097152kb"
        new_exec_host = self.job1_exec_host
        new_exec_vnode = self.job1_exec_vnode.replace(
            "%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.select': newsel,
                                 'Resource_List.nodect': 3,
                                 'schedselect': newsel,
                                 'exec_host': new_exec_host,
                                 'exec_vnode': new_exec_vnode}, id=jid)