12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
7527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673 |
- # coding: utf-8
- # Copyright (C) 1994-2018 Altair Engineering, Inc.
- # For more information, contact Altair at www.altair.com.
- #
- # This file is part of the PBS Professional ("PBS Pro") software.
- #
- # Open Source License Information:
- #
- # PBS Pro is free software. You can redistribute it and/or modify it under the
- # terms of the GNU Affero General Public License as published by the Free
- # Software Foundation, either version 3 of the License, or (at your option) any
- # later version.
- #
- # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
- # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- # FOR A PARTICULAR PURPOSE.
- # See the GNU Affero General Public License for more details.
- #
- # You should have received a copy of the GNU Affero General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- #
- # Commercial License Information:
- #
- # For a copy of the commercial license terms and conditions,
- # go to: (http://www.pbspro.com/UserArea/agreement.html)
- # or contact the Altair Legal Department.
- #
- # Altair’s dual-license business model allows companies, individuals, and
- # organizations to create proprietary derivative works of PBS Pro and
- # distribute them - whether embedded or bundled with other software -
- # under a commercial license agreement.
- #
- # Use of Altair’s trademarks, including but not limited to "PBS™",
- # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
- # trademark licensing policies.
- from tests.functional import *
def convert_time(fmt, tm, fixdate=False):
    """
    Convert the given epoch timestamp 'tm' into format 'fmt'.

    If 'fixdate' is True (and the platform is not Windows/Cygwin),
    re-pad a single-digit day of month (i.e. day <= 9) with a leading
    space instead of strftime's zero padding, so the result compares
    equal to ctime()-style output as used by qstat.

    :param fmt: strftime-style format string.
    :param tm: epoch timestamp (anything float() accepts).
    :param fixdate: when True, space-pad a one-digit day of month.
    :returns: the formatted local-time string.
    """
    rv = time.strftime(fmt, time.localtime(float(tm)))
    if fixdate and sys.platform not in ('cygwin', 'win32'):
        # Day of month is expected to be the third whitespace-separated
        # field (e.g. "Thu Jan 01 ..."). NOTE: rejoining with a single
        # space also collapses any run of spaces in the formatted
        # string, matching the original behavior.
        fields = rv.split()
        day = int(fields[2])
        if day <= 9:
            # ctime() pads a one-digit day with a space, not a zero.
            fields[2] = ' ' + str(day)
        else:
            fields[2] = str(day)
        rv = ' '.join(fields)
    return rv
def create_subjob_id(job_array_id, subjob_index):
    """
    Build a subjob identifier by inserting 'subjob_index' between the
    square brackets of the job array id 'job_array_id'
    (e.g. '123[].server' + 4 -> '123[4].server').
    """
    # Position just past the opening bracket of the first '[]' pair.
    cut = job_array_id.find('[]') + 1
    return '%s%s%s' % (job_array_id[:cut], subjob_index, job_array_id[cut:])
- class TestPbsReliableJobStartup(TestFunctional):
- """
- This tests the Reliable Job Startup Feature,
- where a job can be started with extra nodes,
- with node failures tolerated during job start
- (and even throughout the life of the job),
- before pruning job back to a set of healthy
- nodes that satisfy the original request.
- Custom parameters:
- moms: colon-separated hostnames of five MoMs
- """
def pbs_nodefile_match_exec_host(self, jid, exec_host,
                                 schedselect=None):
    """
    Look into the PBS_NODEFILE on the first host listed in 'exec_host'
    and return True if all host entries in 'exec_host' match the
    entries in the file; otherwise return False.

    If 'schedselect' is given, its per-chunk 'mpiprocs' values are
    honored: the corresponding host must appear in PBS_NODEFILE
    'mpiprocs' number of times.

    :param jid: job id; PBS_NODEFILE is $PBS_HOME/aux/<jid>.
    :param exec_host: '+'-separated exec_host string (host/slot specs).
    :param schedselect: optional '+'-separated schedselect string.
    :returns: True on a full match, False otherwise.
    """
    pbs_nodefile = os.path.join(self.server.pbs_conf['PBS_HOME'],
                                'aux', jid)
    # Collect the mpiprocs=<n> value of each chunk, in chunk order.
    mpiprocs = []
    if schedselect is not None:
        for chunk in schedselect.split('+'):
            for res in chunk.split(':'):
                if '=' in res:
                    parts = res.split('=')
                    if parts[0] == "mpiprocs":
                        mpiprocs.append(parts[1])
    ehost = exec_host.split('+')
    first_host = ehost[0].split('/')[0]
    # Read the nodefile on the mother-superior host.
    ret = self.server.du.run_cmd(first_host, ['cat', pbs_nodefile],
                                 sudo=False)
    # Nodefile entries may be FQDNs; compare short hostnames only.
    ehost2 = [h.split('.')[0] for h in ret['out']]
    # Expand exec_host into the list of expected nodefile entries.
    ehost1 = []
    for j, eh in enumerate(ehost):
        host = eh.split('/')[0]
        if mpiprocs:
            ehost1.extend([host] * int(mpiprocs[j]))
        else:
            ehost1.append(host)
    self.logger.info("EHOST1=%s" % (ehost1,))
    self.logger.info("EHOST2=%s" % (ehost2,))
    # cmp() is Python 2-only; direct list equality is the equivalent
    # (and portable) comparison.
    return ehost1 == ehost2
def match_accounting_log(self, atype, jid, exec_host, exec_vnode,
                         mem, ncpus, nodect, place, select):
    """
    Check that there is an accounting log record 'atype' for job 'jid'
    containing the values given (i.e. Resource_List.exec_host,
    Resource_List.exec_vnode, etc...). Throws an exception upon
    encountering a non-matching accounting_logs entry.

    Some example values of 'atype' are: 'u' (update record due to
    release node request), 'c' (record containing the next set of
    resources to be used by a phased job as a result of release node
    request), 'e' (last update record for a phased job due to a
    release node request), 'E' (end of job record), 's' (secondary
    start record).
    """
    if atype == 'e':
        # The 'e' record is only written after MoM sends the obit;
        # wait for it before matching the accounting log.
        self.mom.log_match("Job;%s;Obit sent" % (jid,), n=100,
                           max_attempts=5, interval=5)
    # Raw strings for the '\.' patterns: in a normal literal '\.' is
    # an invalid escape sequence (deprecated under Python 3); the
    # runtime bytes are identical.
    patterns = [
        ".*%s;%s.*exec_host=%s" % (atype, jid, exec_host),
        ".*%s;%s.*exec_vnode=%s" % (atype, jid, exec_vnode),
        r".*%s;%s.*Resource_List\.mem=%s" % (atype, jid, mem),
        r".*%s;%s.*Resource_List\.ncpus=%d" % (atype, jid, ncpus),
        r".*%s;%s.*Resource_List\.nodect=%d" % (atype, jid, nodect),
        r".*%s;%s.*Resource_List\.place=%s" % (atype, jid, place),
        r".*%s;%s.*Resource_List\.select=%s" % (atype, jid, select),
    ]
    # 'c', 'S' and 's' records carry no resources_used values.
    if atype not in ('c', 'S', 's'):
        patterns.append(r".*%s;%s.*resources_used\." % (atype, jid))
    for pat in patterns:
        self.server.accounting_match(msg=pat, regexp=True, n=20,
                                     max_attempts=3)
def match_vnode_status(self, vnode_list, state, jobs=None, ncpus=None,
                       mem=None):
    """
    Verify the status of every vnode named in 'vnode_list': its state,
    jobs string, resources_assigned.ncpus, and resources_assigned.mem
    must match the given arguments (each optional argument is only
    checked when it is not None). Throws an exception when a match
    is not found.
    """
    # Only the non-None optional attributes take part in the match.
    optional = (('jobs', jobs),
                ('resources_assigned.ncpus', ncpus),
                ('resources_assigned.mem', mem))
    for vname in vnode_list:
        expected = {'state': state}
        for attr, value in optional:
            if value is not None:
                expected[attr] = value
        self.server.expect(VNODE, expected, id=vname)
def create_and_submit_job(self, job_type, attribs=None):
    """
    Create a job object according to 'job_type' and attribute list
    'attribs', then submit it to the server; returns the job id.
    """
    job = Job(TEST_USER, attrs=attribs) if attribs else Job(TEST_USER)
    # 'job_type' doubles as the key into self.script; unknown types
    # are submitted without a script, as before.
    known_types = ('job1', 'job1_2', 'job1_3', 'job1_4',
                   'job2', 'job3', 'job4', 'job5', 'jobA')
    if job_type in known_types:
        job.create_script(self.script[job_type])
    return self.server.submit(job)
- def setUp(self):
- if len(self.moms) != 5:
- cmt = "need 5 mom hosts: -p moms=<m1>:<m2>:<m3>:<m4>:<m5>"
- self.skip_test(reason=cmt)
- TestFunctional.setUp(self)
- Job.dflt_attributes[ATTR_k] = 'oe'
- self.server.cleanup_jobs(extend="force")
- self.momA = self.moms.values()[0]
- self.momB = self.moms.values()[1]
- self.momC = self.moms.values()[2]
- self.momD = self.moms.values()[3]
- self.momE = self.moms.values()[4]
- # Now start setting up and creating the vnodes
- self.server.manager(MGR_CMD_DELETE, NODE, None, "")
- # set node momA
- self.hostA = self.momA.shortname
- self.momA.delete_vnode_defs()
- vnode_prefix = self.hostA
- a = {'resources_available.mem': '1gb',
- 'resources_available.ncpus': '1'}
- vnodedef = self.momA.create_vnode_def(vnode_prefix, a, 4)
- self.assertNotEqual(vnodedef, None)
- self.momA.insert_vnode_def(vnodedef, 'vnode.def')
- self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA)
- # set node momB
- self.hostB = self.momB.shortname
- self.momB.delete_vnode_defs()
- vnode_prefix = self.hostB
- a = {'resources_available.mem': '1gb',
- 'resources_available.ncpus': '1'}
- vnodedef = self.momB.create_vnode_def(vnode_prefix, a, 5,
- usenatvnode=True)
- self.assertNotEqual(vnodedef, None)
- self.momB.insert_vnode_def(vnodedef, 'vnode.def')
- self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB)
- # set node momC
- # This one has no vnode definition.
- self.hostC = self.momC.shortname
- self.momC.delete_vnode_defs()
- self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostC)
- a = {'resources_available.ncpus': 2,
- 'resources_available.mem': '2gb'}
- # set natural vnode of hostC
- self.server.manager(MGR_CMD_SET, NODE, a, id=self.hostC,
- expect=True)
- # set node momD
- # This one has no vnode definition.
- self.hostD = self.momD.shortname
- self.momD.delete_vnode_defs()
- self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostD)
- a = {'resources_available.ncpus': 5,
- 'resources_available.mem': '5gb'}
- # set natural vnode of hostD
- self.server.manager(MGR_CMD_SET, NODE, a, id=self.hostD,
- expect=True)
- # set node momE
- self.hostE = self.momE.shortname
- self.momE.delete_vnode_defs()
- vnode_prefix = self.hostE
- a = {'resources_available.mem': '1gb',
- 'resources_available.ncpus': '1'}
- vnodedef = self.momE.create_vnode_def(vnode_prefix, a, 5,
- usenatvnode=True)
- self.assertNotEqual(vnodedef, None)
- self.momE.insert_vnode_def(vnodedef, 'vnode.def')
- self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostE)
- # Various node names
- self.nA = self.hostA
- self.nAv0 = '%s[0]' % (self.hostA,)
- self.nAv1 = '%s[1]' % (self.hostA,)
- self.nAv2 = '%s[2]' % (self.hostA,)
- self.nAv3 = '%s[3]' % (self.hostA,)
- self.nB = self.hostB
- self.nBv0 = '%s[0]' % (self.hostB,)
- self.nBv1 = '%s[1]' % (self.hostB,)
- self.nBv2 = '%s[2]' % (self.hostB,)
- self.nBv3 = '%s[3]' % (self.hostB,)
- self.nC = self.hostC
- self.nD = self.hostD
- self.nE = self.hostE
- self.nEv0 = '%s[0]' % (self.hostE,)
- self.nEv1 = '%s[1]' % (self.hostE,)
- self.nEv2 = '%s[2]' % (self.hostE,)
- self.nEv3 = '%s[3]' % (self.hostE,)
- a = {'state': 'free', 'resources_available.ncpus': (GE, 1)}
- self.server.expect(VNODE, {'state=free': 17}, count=True,
- max_attempts=10, interval=2)
- if sys.platform in ('cygwin', 'win32'):
- SLEEP_CMD = "pbs-sleep"
- else:
- SLEEP_CMD = os.path.join(os.sep, "bin", "sleep")
- self.pbs_release_nodes_cmd = os.path.join(
- self.server.pbs_conf['PBS_EXEC'], 'bin', 'pbs_release_nodes')
- FIB37 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin',
- 'pbs_python') + \
- ' -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
- return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(37)\\\")"'
- self.fib37_value = 24157817
- FIB40 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin',
- 'pbs_python') + \
- ' -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
- return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(40)\\\")"'
- # job submission arguments
- self.script = {}
- # original select spec
- self.job1_oselect = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=2:mem=2gb"
- self.job1_place = "scatter"
- # incremented values at job start and just before actual launch
- self.job1_iselect = \
- "1:ncpus=3:mem=2gb+2:ncpus=3:mem=2gb+2:ncpus=2:mem=2gb"
- self.job1_ischedselect = self.job1_iselect
- self.job1_iexec_host = "%s/0*0+%s/0*0+%s/0*3+%s/0*2+%s/0*0" % (
- self.nA, self.nB, self.nD, self.nC, self.nE)
- self.job1_iexec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:ncpus=2:mem=2097152kb)+" % (self.nC,) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
- "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
- self.job1_isel_esc = self.job1_iselect.replace("+", "\+")
- self.job1_iexec_host_esc = self.job1_iexec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job1_iexec_vnode_esc = self.job1_iexec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- # expected values version 1 upon successful job launch
- self.job1_select = \
- "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
- self.job1_schedselect = self.job1_select
- self.job1_exec_host = "%s/0*0+%s/0*3+%s/0*0" % (
- self.nA, self.nD, self.nE)
- self.job1_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
- "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
- self.job1_sel_esc = self.job1_select.replace("+", "\+")
- self.job1_exec_host_esc = self.job1_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job1_exec_vnode_esc = self.job1_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- # expected values version 2 upon successful job launch
- self.job1v2_select = \
- "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
- self.job1v2_schedselect = self.job1v2_select
- self.job1v2_exec_host = "%s/0*0+%s/0*3+%s/0*2" % (
- self.nA, self.nD, self.nC)
- self.job1v2_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:ncpus=2:mem=2097152kb)" % (self.nC,)
- self.job1v2_sel_esc = self.job1v2_select.replace("+", "\+")
- self.job1v2_exec_host_esc = self.job1v2_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job1v2_exec_vnode_esc = self.job1v2_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- # expected values version 3 upon successful job launch
- self.job1v3_select = \
- "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
- self.job1v3_schedselect = self.job1v3_select
- self.job1v3_exec_host = "%s/0*0+%s/0*0+%s/0*0" % (
- self.nA, self.nB, self.nE)
- self.job1v3_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
- "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
- self.job1v3_sel_esc = self.job1v3_select.replace("+", "\+")
- self.job1v3_exec_host_esc = self.job1v3_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job1v3_exec_vnode_esc = self.job1v3_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- # expected values version 4 upon successful job launch
- self.job1v4_select = \
- "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
- self.job1v4_schedselect = self.job1v4_select
- self.job1v4_exec_host = "%s/0*0+%s/0*0+%s/0*2" % (
- self.nA, self.nB, self.nD)
- self.job1v4_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:ncpus=2:mem=2097152kb)" % (self.nD,)
- self.job1v4_sel_esc = self.job1v4_select.replace("+", "\+")
- self.job1v4_exec_host_esc = self.job1v4_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job1v4_exec_vnode_esc = self.job1v4_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- # expected values version 5 upon successful job launch
- self.job1v5_select = \
- "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
- self.job1v5_schedselect = self.job1v5_select
- self.job1v5_exec_host = "%s/0*0+%s/0*0+%s/0*2" % (
- self.nA, self.nB, self.nC)
- self.job1v5_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:ncpus=2:mem=2097152kb)" % (self.nC,)
- self.job1v5_sel_esc = self.job1v5_select.replace("+", "\+")
- self.job1v5_exec_host_esc = self.job1v5_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job1v5_exec_vnode_esc = self.job1v5_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- # expected values version 6 upon successful job launch
- self.job1v6_select = \
- "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
- self.job1v6_select += "+1:ncpus=1:mem=1gb"
- self.job1v6_schedselect = self.job1v6_select
- self.job1v6_exec_host = "%s/0*0+%s/0*0+%s/0*2+%s/0" % (
- self.nA, self.nB, self.nC, self.nE)
- self.job1v6_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:ncpus=2:mem=2097152kb)+" % (self.nC,) + \
- "(%s:mem=1048576kb:ncpus=1)" % (self.nE,)
- self.job1v6_sel_esc = self.job1v6_select.replace("+", "\+")
- self.job1v6_exec_host_esc = self.job1v6_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job1v6_exec_vnode_esc = self.job1v6_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.script['job1'] = """
- #PBS -l select=%s
- #PBS -l place=%s
- #PBS -W umask=022
- #PBS -S /bin/bash
- echo "$PBS_NODEFILE"
- cat $PBS_NODEFILE
- echo 'FIB TESTS'
- echo 'pbsdsh -n 1 fib 37'
- pbsdsh -n 1 -- %s
- echo 'pbsdsh -n 2 fib 37'
- pbsdsh -n 2 -- %s
- echo 'fib 37'
- %s
- echo 'HOSTNAME TESTS'
- echo 'pbsdsh -n 0 hostname'
- pbsdsh -n 0 -- hostname -s
- echo 'pbsdsh -n 1 hostname'
- pbsdsh -n 1 -- hostname -s
- echo 'pbsdsh -n 2 hostname'
- pbsdsh -n 2 -- hostname -s
- echo 'PBS_NODEFILE tests'
- for h in `cat $PBS_NODEFILE`
- do
- echo "HOST=$h"
- echo "pbs_tmrsh $h hostname"
- pbs_tmrsh $h hostname -s
- done
- """ % (self.job1_oselect, self.job1_place, FIB37, FIB37, FIB37)
- # original select spec
- self.jobA_oselect = "ncpus=1:mem=1gb+ncpus=1:mem=1gb+ncpus=1:mem=1gb"
- self.jobA_place = "scatter"
- # incremented values at job start and just before actual launch
- self.jobA_iselect = \
- "1:ncpus=1:mem=1gb+2:ncpus=1:mem=1gb+2:ncpus=1:mem=1gb"
- self.jobA_ischedselect = self.jobA_iselect
- self.jobA_iexec_host1 = "%s/0+%s/0+%s/0+%s/0+%s/0" % (
- self.nA, self.nB, self.nC, self.nD, self.nE)
- self.jobA_iexec_host2 = "%s/1+%s/1+%s/1+%s/1+%s/1" % (
- self.nA, self.nB, self.nC, self.nD, self.nE)
- self.jobA_iexec_host3 = "%s/2+%s/2+%s/0+%s/2+%s/0" % (
- self.nA, self.nB, self.nC, self.nD, self.nE)
- self.jobA_iexec_vnode1 = \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv0,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nB,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nC,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nD,) + \
- "(%s:ncpus=1:mem=1048576kb)" % (self.nE,)
- self.jobA_iexec_vnode2 = \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv1,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nBv0,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nC,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nD,) + \
- "(%s:ncpus=1:mem=1048576kb)" % (self.nEv0,)
- self.jobA_iexec_vnode3 = \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv2,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nBv1,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nC,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nD,) + \
- "(%s:ncpus=1:mem=1048576kb)" % (self.nE,)
- self.jobA_isel_esc = self.jobA_iselect.replace("+", "\+")
- self.jobA_iexec_host1_esc = self.jobA_iexec_host1.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.jobA_iexec_host2_esc = self.jobA_iexec_host2.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.jobA_iexec_host3_esc = self.jobA_iexec_host3.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.jobA_iexec_vnode1_esc = self.jobA_iexec_vnode1.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.jobA_iexec_vnode2_esc = self.jobA_iexec_vnode2.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.jobA_iexec_vnode3_esc = self.jobA_iexec_vnode3.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- # expected values version 1 upon successful job launch
- self.jobA_select = \
- "1:ncpus=1:mem=1gb+1:ncpus=1:mem=1gb+1:ncpus=1:mem=1gb"
- self.jobA_schedselect = self.jobA_select
- self.jobA_exec_host1 = "%s/0+%s/0+%s/0" % (
- self.nA, self.nB, self.nD)
- self.jobA_exec_host2 = "%s/1+%s/1+%s/1" % (
- self.nA, self.nB, self.nD)
- self.jobA_exec_host3 = "%s/2+%s/2+%s/2" % (
- self.nA, self.nB, self.nD)
- self.jobA_exec_vnode1 = \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv0,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nB,) + \
- "(%s:ncpus=1:mem=1048576kb)" % (self.nD,)
- self.jobA_exec_vnode2 = \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv1,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nBv0,) + \
- "(%s:ncpus=1:mem=1048576kb)" % (self.nD,)
- self.jobA_exec_vnode3 = \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv2,) + \
- "(%s:ncpus=1:mem=1048576kb)+" % (self.nBv1,) + \
- "(%s:ncpus=1:mem=1048576kb)" % (self.nD,)
- self.jobA_sel_esc = self.jobA_select.replace("+", "\+")
- self.jobA_exec_host1_esc = self.jobA_exec_host1.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.jobA_exec_host2_esc = self.jobA_exec_host2.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.jobA_exec_host3_esc = self.jobA_exec_host3.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.jobA_exec_vnode1_esc = self.jobA_exec_vnode1.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.jobA_exec_vnode2_esc = self.jobA_exec_vnode2.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.jobA_exec_vnode3_esc = self.jobA_exec_vnode3.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.script['jobA'] = """
- #PBS -J 1-3
- #PBS -l select=%s
- #PBS -l place=%s
- #PBS -S /bin/bash
- echo 'HOSTNAME TESTS'
- echo 'pbsdsh -n 0 hostname'
- pbsdsh -n 0 -- hostname -s
- echo 'pbsdsh -n 1 hostname'
- pbsdsh -n 1 -- hostname -s
- echo 'pbsdsh -n 2 hostname'
- pbsdsh -n 2 -- hostname -s
- sleep 180
- """ % (self.jobA_oselect, self.jobA_place)
- self.script['job1_3'] = """
- #PBS -l select=%s
- #PBS -l place=%s
- #PBS -W umask=022
- #PBS -S /bin/bash
- echo "$PBS_NODEFILE"
- cat $PBS_NODEFILE
- echo 'FIB TESTS'
- echo 'pbsdsh -n 2 fib 40'
- pbsdsh -n 2 -- %s
- echo 'fib 40'
- %s
- echo 'HOSTNAME TESTS'
- echo 'pbsdsh -n 0 hostname'
- pbsdsh -n 0 -- hostname -s
- echo 'pbsdsh -n 2 hostname'
- pbsdsh -n 2 -- hostname -s
- """ % (self.job1_oselect, self.job1_place, FIB40, FIB40)
- self.script['job1_2'] = """
- #PBS -l select=%s
- #PBS -l place=%s
- #PBS -W umask=022
- #PBS -S /bin/bash
- echo "$PBS_NODEFILE"
- cat $PBS_NODEFILE
- echo 'FIB TESTS'
- echo 'pbsdsh -n 2 fib 37'
- pbsdsh -n 2 -- %s
- echo 'fib 37'
- %s
- echo 'HOSTNAME TESTS'
- echo 'pbsdsh -n 0 hostname'
- pbsdsh -n 0 -- hostname -s
- echo 'pbsdsh -n 2 hostname'
- pbsdsh -n 2 -- hostname -s
- """ % (self.job1_oselect, self.job1_place, FIB37, FIB37)
- self.script['job1_3'] = """
- #PBS -l select=%s
- #PBS -l place=%s
- #PBS -W umask=022
- #PBS -S /bin/bash
- echo "$PBS_NODEFILE"
- cat $PBS_NODEFILE
- echo 'FIB TESTS'
- echo 'pbsdsh -n 2 fib 40'
- pbsdsh -n 2 -- %s
- echo 'fib 40'
- %s
- echo 'HOSTNAME TESTS'
- echo 'pbsdsh -n 0 hostname'
- pbsdsh -n 0 -- hostname -s
- echo 'pbsdsh -n 2 hostname'
- pbsdsh -n 2 -- hostname -s
- """ % (self.job1_oselect, self.job1_place, FIB40, FIB40)
- self.script['job1_4'] = """
- #PBS -l select=%s
- #PBS -l place=%s
- #PBS -W umask=022
- #PBS -S /bin/bash
- echo "$PBS_NODEFILE"
- cat $PBS_NODEFILE
- echo 'FIB TESTS'
- echo 'pbsdsh -n 1 fib 37'
- pbsdsh -n 1 -- %s
- echo 'pbsdsh -n 2 fib 37'
- pbsdsh -n 2 -- %s
- echo 'pbsdsh -n 3 fib 37'
- pbsdsh -n 3 -- %s
- echo 'fib 37'
- %s
- echo 'HOSTNAME TESTS'
- echo 'pbsdsh -n 0 hostname'
- pbsdsh -n 0 -- hostname -s
- echo 'pbsdsh -n 1 hostname'
- pbsdsh -n 1 -- hostname -s
- echo 'pbsdsh -n 2 hostname'
- pbsdsh -n 2 -- hostname -s
- echo 'pbsdsh -n 3 hostname'
- pbsdsh -n 3 -- hostname -s
- echo 'PBS_NODEFILE tests'
- for h in `cat $PBS_NODEFILE`
- do
- echo "HOST=$h"
- echo "pbs_tmrsh $h hostname"
- pbs_tmrsh $h hostname -s
- done
- """ % (self.job1_oselect, self.job1_place, FIB37, FIB37, FIB37, FIB37)
- # original select spec
- self.job2_oselect = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=0:mem=2gb"
- self.job2_place = "scatter"
- # incremented values at job start and just before actual launch
- self.job2_iselect = \
- "1:ncpus=3:mem=2gb+2:ncpus=3:mem=2gb+2:ncpus=0:mem=2gb"
- self.job2_ischedselect = self.job2_iselect
- self.job2_iexec_host = "%s/0*0+%s/0*0+%s/0*3+%s/0*0+%s/0*0" % (
- self.nA, self.nB, self.nD, self.nC, self.nE)
- self.job2_iexec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:ncpus=0:mem=2097152kb)+" % (self.nC,) + \
- "(%s:mem=1048576kb:ncpus=0+" % (self.nE,) + \
- "%s:mem=1048576kb)" % (self.nEv0,)
- self.job2_isel_esc = self.job2_iselect.replace("+", "\+")
- self.job2_iexec_host_esc = self.job2_iexec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job2_iexec_vnode_esc = self.job2_iexec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- # expected values version upon successful job launch
- self.job2_select = \
- "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=0:mem=2gb"
- self.job2_schedselect = self.job2_select
- self.job2_exec_host = "%s/0*0+%s/0*3+%s/0*0" % (
- self.nA, self.nD, self.nE)
- # ncpus=0 assigned hosts are not listed in $PBS_NODEFILE
- self.job2_exec_host_nfile = "%s/0*0+%s/0*3" % (
- self.nA, self.nD)
- self.job2_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:mem=1048576kb+" % (self.nE,) + \
- "%s:mem=1048576kb)" % (self.nEv0,)
- self.job2_sel_esc = self.job2_select.replace("+", "\+")
- self.job2_exec_host_esc = self.job2_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job2_exec_vnode_esc = self.job2_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.script['job2'] = \
- "#PBS -l select=" + self.job2_oselect + "\n" + \
- "#PBS -l place=" + self.job2_place + "\n" + \
- SLEEP_CMD + " 60\n"
- # Job with mpiprocs and ompthreads requested
- self.job3_oselect = \
- "ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
- "ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
- "ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
- self.job3_place = "scatter"
- # incremented values at job start and just before actual launch
- self.job3_iselect = \
- "1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
- "2:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
- "2:ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
- self.job3_ischedselect = self.job3_iselect
- self.job3_iexec_host = \
- "%s/0*0+%s/0*0+%s/0*3+%s/0*2+%s/0*0" % (
- self.nA, self.nB, self.nD, self.nC, self.nE)
- self.job3_iexec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:ncpus=2:mem=2097152kb)+" % (self.nC,) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
- "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
- # expected values version 6 upon successful job launch
- self.job3_select = \
- "1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
- "1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
- "1:ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
- self.job3_schedselect = self.job3_select
- self.job3_exec_host = "%s/0*0+%s/0*3+%s/0*0" % (
- self.nA, self.nD, self.nE)
- self.job3_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
- "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
- self.job3_sel_esc = self.job3_select.replace("+", "\+")
- self.job3_exec_host_esc = self.job3_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job3_exec_vnode_esc = self.job3_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.job3_isel_esc = self.job3_iselect.replace("+", "\+")
- self.job3_iexec_host_esc = self.job3_iexec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job3_iexec_vnode_esc = self.job3_iexec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.script['job3'] = \
- "#PBS -l select=" + self.job3_oselect + "\n" + \
- "#PBS -l place=" + self.job3_place + "\n" + \
- SLEEP_CMD + " 300\n"
- self.job3_ischedselect = self.job3_iselect
- self.job4_oselect = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=2:mem=2gb"
- self.job4_place = "scatter:excl"
- self.job4_iselect = \
- "1:ncpus=3:mem=2gb+2:ncpus=3:mem=2gb+2:ncpus=2:mem=2gb"
- self.job4_ischedselect = self.job4_iselect
- self.job4_iexec_host = \
- "%s/0*0+%s/0*0+%s/0*3+%s/0*2+%s/0*0" % (
- self.nA, self.nB, self.nD, self.nC, self.nE)
- self.job4_iexec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:ncpus=2:mem=2097152kb)+" % (self.nC,) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
- "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
- # expected values upon successful job launch
- self.job4_select = \
- "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
- self.job4_schedselect = "1:ncpus=3:mem=2gb+" + \
- "1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
- self.job4_exec_host = "%s/0*0+%s/0*3+%s/0*0" % (
- self.nA, self.nD, self.nE)
- self.job4_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
- "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
- self.script['job4'] = \
- "#PBS -l select=" + self.job4_oselect + "\n" + \
- "#PBS -l place=" + self.job4_place + "\n" + \
- SLEEP_CMD + " 300\n"
- self.job4_sel_esc = self.job4_select.replace("+", "\+")
- self.job4_exec_host_esc = self.job4_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job4_exec_vnode_esc = self.job4_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.job4_isel_esc = self.job4_iselect.replace("+", "\+")
- self.job4_iexec_host_esc = self.job4_iexec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job4_iexec_vnode_esc = self.job4_iexec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.job5_oselect = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=2:mem=2gb"
- self.job5_place = "free"
- self.job5_iselect = \
- "1:ncpus=3:mem=2gb+2:ncpus=3:mem=2gb+2:ncpus=2:mem=2gb"
- self.job5_ischedselect = self.job5_iselect
- self.job5_iexec_host = \
- "%s/0*0+%s/0*0+%s/0*3+%s/1*0+%s/0*2" % (
- self.nA, self.nB, self.nD, self.nB, self.nC)
- self.job5_iexec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
- "(%s:mem=1048576kb+" % (self.nBv1,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv2,) + \
- "%s:ncpus=1)+" % (self.nBv3,) + \
- "(%s:ncpus=2:mem=2097152kb)" % (self.nC,)
- # expected values upon successful job launch
- self.job5_select = \
- "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=1:mem=1gb"
- self.job5_schedselect = self.job5_select
- self.job5_exec_host = "%s/0*0+%s/0*0+%s/1*0" % (
- self.nA, self.nB, self.nB)
- self.job5_exec_vnode = \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
- "%s:ncpus=1)+" % (self.nAv2) + \
- "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
- "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
- "%s:ncpus=1)+" % (self.nBv1,) + \
- "(%s:mem=1048576kb+" % (self.nBv1,) + \
- "%s:ncpus=1)" % (self.nBv2,)
- self.script['job5'] = \
- "#PBS -l select=" + self.job5_oselect + "\n" + \
- "#PBS -l place=" + self.job5_place + "\n" + \
- SLEEP_CMD + " 300\n"
- self.job5_sel_esc = self.job5_select.replace("+", "\+")
- self.job5_exec_host_esc = self.job5_exec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job5_exec_vnode_esc = self.job5_exec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- self.job5_isel_esc = self.job5_iselect.replace("+", "\+")
- self.job5_iexec_host_esc = self.job5_iexec_host.replace(
- "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
- self.job5_iexec_vnode_esc = self.job5_iexec_vnode.replace(
- "[", "\[").replace("]", "\]").replace("(", "\(").replace(
- ")", "\)").replace("+", "\+")
- # queuejob hooks used throughout the test
- self.qjob_hook_body = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
- # Save current select spec in resource 'site'
- e.job.Resource_List["site"] = str(e.job.Resource_List["select"])
- new_select = e.job.Resource_List["select"].increment_chunks(1)
- e.job.Resource_List["select"] = new_select
- e.job.tolerate_node_failures = "job_start"
- """
- self.qjob_hook_body2 = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
- # Save current select spec in resource 'site'
- e.job.Resource_List["site"] = str(e.job.Resource_List["select"])
- new_select = e.job.Resource_List["select"].increment_chunks(1)
- e.job.Resource_List["select"] = new_select
- e.job.tolerate_node_failures = "all"
- """
- # begin hooks used throughout the test
- self.begin_hook_body = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
- localnode=pbs.get_local_nodename()
- if not e.job.in_ms_mom() and (localnode == '%s'):
- e.reject("bad node")
- """ % (self.nB,)
- # The below hook may not really be doing anything, but is
- # used in a test of the sister join job alarm time with
- # the hook's alarm value.
- self.begin_hook_body2 = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
- localnode=pbs.get_local_nodename()
- """
- self.begin_hook_body3 = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
- localnode=pbs.get_local_nodename()
- if not e.job.in_ms_mom() and (localnode == '%s'):
- x
- """ % (self.nE,)
- self.begin_hook_body4 = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
- localnode=pbs.get_local_nodename()
- if not e.job.in_ms_mom() and (localnode == '%s'):
- e.reject("bad node")
- """ % (self.nD,)
- self.begin_hook_body5 = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
- localnode=pbs.get_local_nodename()
- if not e.job.in_ms_mom() and (localnode == '%s'):
- e.reject("bad node")
- """ % (self.nC,)
- # prologue hooks used throughout the test
- self.prolo_hook_body = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "prolo: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "prolo: found vnode_list_fail[" + v.name + "]")
- localnode=pbs.get_local_nodename()
- if not e.job.in_ms_mom() and (localnode == '%s'):
- e.reject("bad node")
- """ % (self.nC,)
- self.prolo_hook_body2 = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing prologue")
- localnode=pbs.get_local_nodename()
- if not e.job.in_ms_mom() and (localnode == '%s'):
- x
- """ % (self.nC,)
- self.prolo_hook_body3 = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "prolo: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "prolo: found vnode_list_fail[" + v.name + "]")
- localnode=pbs.get_local_nodename()
- """
- self.prolo_hook_body4 = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "prolo: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "prolo: found vnode_list_fail[" + v.name + "]")
- localnode=pbs.get_local_nodename()
- if e.job.in_ms_mom():
- pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
- if pj != None:
- pbs.logjobmsg(e.job.id, "prolo: job.exec_vnode=%s" % (pj.exec_vnode,))
- pbs.logjobmsg(e.job.id, "prolo: job.exec_host=%s" % (pj.exec_host,))
- pbs.logjobmsg(e.job.id,
- "prolo: job.schedselect=%s" % (pj.schedselect,))
- else:
- e.job.Hold_Types = pbs.hold_types("s")
- e.job.rerun()
- e.reject("unsuccessful at PROLOGUE")
- """
- self.prolo_hook_body5 = """
- import pbs
- import time
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "prolo: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "prolo: found vnode_list_fail[" + v.name + "]")
- if not e.job.in_ms_mom():
- pbs.logjobmsg(e.job.id, "sleeping for 30 secs")
- time.sleep(30)
- """
- # launch hooks used throughout the test
- self.launch_hook_body = """
- import pbs
- e=pbs.event()
- if 'PBS_NODEFILE' not in e.env:
- e.accept()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
- if e.job.in_ms_mom():
- pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
- if pj != None:
- pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
- pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
- pbs.logjobmsg(e.job.id,
- "launch: job.schedselect=%s" % (pj.schedselect,))
- else:
- e.job.Hold_Types = pbs.hold_types("s")
- e.job.rerun()
- e.reject("unsuccessful at LAUNCH")
- """
- self.launch_hook_body2 = """
- import pbs
- e=pbs.event()
- if 'PBS_NODEFILE' not in e.env:
- e.accept()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
- if e.job.in_ms_mom():
- new_sel = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=1:mem=1gb"
- pj = e.job.release_nodes(keep_select=new_sel)
- if pj != None:
- pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
- pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
- pbs.logjobmsg(e.job.id,
- "launch: job.schedselect=%s" % (pj.schedselect,))
- else:
- e.job.Hold_Types = pbs.hold_types("s")
- e.job.rerun()
- e.reject("unsuccessful at LAUNCH")
- """
- def tearDown(self):
- self.momA.signal("-CONT")
- self.momB.signal("-CONT")
- self.momC.signal("-CONT")
- self.momD.signal("-CONT")
- self.momE.signal("-CONT")
- self.momA.unset_mom_config('$sister_join_job_alarm', False)
- self.momA.unset_mom_config('$job_launch_delay', False)
- a = {'state': (DECR, 'offline')}
- self.server.manager(MGR_CMD_SET, NODE, a, self.momA.shortname)
- self.server.manager(MGR_CMD_SET, NODE, a, self.momB.shortname)
- self.server.manager(MGR_CMD_SET, NODE, a, self.momC.shortname)
- self.server.manager(MGR_CMD_SET, NODE, a, self.momD.shortname)
- self.server.manager(MGR_CMD_SET, NODE, a, self.momE.shortname)
- TestFunctional.tearDown(self)
- # Delete managers and operators if added
- attrib = ['operators', 'managers']
- self.server.manager(MGR_CMD_UNSET, SERVER, attrib, expect=True)
- @timeout(400)
- def test_t1(self):
- """
- Test tolerating job_start 2 node failures after adding
- extra nodes to the job, pruning
- job's assigned resources to match up to the original
- select spec, and offlining the failed vnodes.
- 1. Have a job that has been submitted with a select
- spec of 2 super-chunks say (A) and (B), and 1 chunk
- of (C), along with place spec of "scatter",
- resulting in the following assignment:
- exec_vnode = (A)+(B)+(C)
- and -Wtolerate_node_failures=job_start
- 2. Have a queuejob hook that adds 1 extra node to each
- chunk (except the MS (first) chunk), resulting in the
- assignment:
- exec_vnode = (A)+(B)+(D)+(C)+(E)
- where D mirrors super-chunk B specs while E mirrors
- chunk C.
- 3. Have an execjob_begin hook that fails (causes rejection)
- when executed by mom managing vnodes in (B).
- 4. Have an execjob_prologue hook that fails (causes rejection)
- when executed by mom managing vnodes in (C).
- 5. Then create an execjob_launch hook that offlines the failed
- nodes (B) and (C), and prunes back the job's exec_vnode
- assignment back to satisfying the original 3-node select
- spec, choosing only healthy nodes.
- 6. Result:
- a. This results in the following reassignment of chunks:
- exec_vnode = (A)+(D)+(E)
- since (B) and (C) contain vnodes from failed moms.
- b. vnodes in (B) and (C) are now showing a state of
- "offline".
- c. The accounting log start record 'S' will reflect the
- select request where additional chunks were added, while
- the secondary start record 's' will reflect the assigned
- resources after pruning the original select request via
- the pbs.release_nodes(keep_select=...) call
- inside execjob_launch hook.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body)
- # instantiate execjob_prologue hook
- hook_event = "execjob_prologue"
- hook_name = "prolo"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
- # instantiate execjob_launch hook
- hook_body = """
- import pbs
- e=pbs.event()
- if 'PBS_NODEFILE' not in e.env:
- e.accept()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "launch:offline vnode_list_fail[" + v.name + "]")
- v.state = pbs.ND_OFFLINE
- if e.job.in_ms_mom():
- pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
- if pj != None:
- pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
- pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
- pbs.logjobmsg(e.job.id,
- "launch: job.schedselect=%s" % (pj.schedselect,))
- else:
- e.job.Hold_Types = pbs.hold_types("s")
- e.job.rerun()
- e.reject("unsuccessful at LAUNCH")
- """
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job1')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job1_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_schedselect,
- 'exec_host': self.job1_exec_host,
- 'exec_vnode': self.job1_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status(
- [self.nAv0, self.nAv1, self.nE, self.nEv0],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2],
- 'job-busy', jobs_assn1, 1, '0kb')
- jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
- self.match_vnode_status([self.nD], 'free', jobs_assn3,
- 3, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
- self.nEv1, self.nEv2, self.nEv3], 'free')
- self.match_vnode_status([self.nB, self.nBv0, self.nBv1, self.nC],
- 'offline')
- # Check server/queue counts.
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
- jid, self.hostB), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
- "is tolerant of node failures",
- regexp=True, n=10)
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
- "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
- "as job is tolerant of node failures", n=10, regexp=True)
- # Check vnode_list[] parameter in execjob_prologue hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_prologue hook
- vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
- for vn in vnode_list_fail:
- self.momA.log_match(
- "Job;%s;launch:offline vnode_list_fail[%s]" % (jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1_exec_host_esc,
- self.job1_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momD.hostname, self.momE.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.momA.shortname, self.momD.shortname, self.momE.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momD.hostname, self.momD.hostname, self.momD.shortname,
- self.momE.hostname, self.momE.hostname, self.momE.shortname)
- self.logger.info("expected out=%s" % (expected_out,))
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.logger.info("job_out=%s" % (job_out,))
- self.assertEquals(job_out, expected_out)
- @timeout(400)
- def test_t2(self):
- """
- Test tolerating job_start 2 node failures after adding
- extra nodes to the job, pruning
- job's assigned resources to match up to the original
- select spec, without offlining the failed vnodes, and
- specifying mom config file options 'sister_join_job_alarm' and
- 'job_launch_delay'.
- 1. Set $sister_join_job_alarm and $job_launch_delay values
- in mom's config file.
- 2. Submit a job that has been submitted with a select
- spec of 2 super-chunks say (A) and (B), and 1 chunk
- of (C), along with place spec of "scatter",
- resulting in the following assignment:
- exec_vnode = (A)+(B)+(C)
- and -Wtolerate_node_failures=job_start
- 3. Have a queuejob hook that adds 1 extra node to each
- chunk (except the MS (first) chunk), resulting in the
- assignment:
- exec_vnode = (A)+(B)+(D)+(C)+(E)
- where D mirrors super-chunk B specs while E mirrors
- chunk C.
- 4. Prior to submitting a job, suspend mom B. When job runs,
- momB won't be able to join the job, so it won't be considered
- as a "healthy" mom.
- 5. Have an execjob_begin hook that doesn't fail.
- 6. Have an execjob_prologue hook that fails (causes rejection)
- when executed by mom managing vnodes in (C).
- 7. Have an execjob_launch hook that prunes back the
- job's exec_vnode assignment back to satisfying the original
- 3-node select spec, choosing only healthy nodes.
- 8. Result:
- a. This results in the following reassignment of chunks:
- exec_vnode = (A)+(D)+(E)
- since (B) and (C) contain vnodes from failed moms.
- b. vnodes in (B) and (C) are now showing a state of "free".
- c. Mom's log file will show explicit values to
- $sister_join_job_alarm and $job_launch_delay.
- c. The accounting log start record 'S' will reflect the
- select request where additional chunks were added, while
- the secondary start record 's' will reflect the assigned
- resources after pruning the original select request via
- the pbs.release_nodes(keep_select=...) call
- inside execjob_launch hook.
- """
- # set mom config options:
- sis_join_alarm = 45
- c = {'$sister_join_job_alarm': sis_join_alarm}
- self.momA.add_config(c)
- job_launch_delay = 40
- c = {'$job_launch_delay': job_launch_delay}
- self.momA.add_config(c)
- self.momA.signal("-HUP")
- self.momA.log_match(
- "sister_join_job_alarm;%d" % (sis_join_alarm,), max_attempts=5,
- interval=5)
- self.momA.log_match(
- "job_launch_delay;%d" % (job_launch_delay,),
- max_attempts=5, interval=5)
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body2)
- # instantiate execjob_prologue hook
- hook_event = "execjob_prologue"
- hook_name = "prolo"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # temporarily suspend momB, simulating a failed mom host.
- self.momB.signal("-STOP")
- jid = self.create_and_submit_job('job1')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- # Set time to start scanning logs
- stime = int(time.time())
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job1_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_schedselect,
- 'exec_host': self.job1_exec_host,
- 'exec_vnode': self.job1_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND,
- max_attempts=100)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Verify the logs and make sure sister_join_job_alarm is honored
- logs = self.mom.log_match(
- "Executing begin",
- allmatch=True, starttime=stime, max_attempts=8)
- log1 = logs[0][1]
- logs = self.mom.log_match(
- "Executing prolo",
- allmatch=True, starttime=stime, max_attempts=8)
- log2 = logs[0][1]
- pattern = '%m/%d/%Y %H:%M:%S'
- tmp = log1.split(';')
- # Convert the time into epoch time
- time1 = int(time.mktime(time.strptime(tmp[0], pattern)))
- tmp = log2.split(';')
- time2 = int(time.mktime(time.strptime(tmp[0], pattern)))
- diff = time2 - time1
- self.logger.info(
- "Time diff between begin hook and prologue hook is " +
- str(diff) + " seconds")
- # Leave a little wiggle room for slow systems
- self.assertTrue((diff >= sis_join_alarm) and
- diff <= (sis_join_alarm + 5))
- self.mom.log_match(
- "sister_join_job_alarm wait time %d secs exceeded" % (
- sis_join_alarm,), starttime=stime, max_attempts=8)
- # Verify the logs and make sure job_launch_delay is honored
- logs = self.mom.log_match(
- "Executing prolo",
- allmatch=True, starttime=stime, max_attempts=8)
- log1 = logs[0][1]
- logs = self.mom.log_match(
- "Executing launch",
- allmatch=True, starttime=stime, max_attempts=8)
- log2 = logs[0][1]
- pattern = '%m/%d/%Y %H:%M:%S'
- tmp = log1.split(';')
- # Convert the time into epoch time
- time1 = int(time.mktime(time.strptime(tmp[0], pattern)))
- tmp = log2.split(';')
- time2 = int(time.mktime(time.strptime(tmp[0], pattern)))
- diff = time2 - time1
- self.logger.info("Time diff between prolo hook and launch hook is " +
- str(diff) + " seconds")
- # Leave a little wiggle room for slow systems
- self.assertTrue((diff >= job_launch_delay) and
- diff <= (job_launch_delay + 3))
- self.momA.log_match(
- "not all prologue hooks to sister moms completed, " +
- "but job will proceed to execute", n=10)
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status(
- [self.nAv0, self.nAv1, self.nE, self.nEv0],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2],
- 'job-busy', jobs_assn1, 1, '0kb')
- jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
- self.match_vnode_status([self.nD], 'free', jobs_assn3,
- 3, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
- self.nBv1, self.nBv2, self.nBv3, self.nC,
- self.nEv1, self.nEv2, self.nEv3], 'free')
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
- "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
- "as job is tolerant of node failures", n=10, regexp=True)
- # Check vnode_list[] parameter in execjob_prologue hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
- jid, vn), n=10)
- # check server/queue counts
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nC]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1_exec_host_esc,
- self.job1_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momD.hostname, self.momE.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.momA.shortname, self.momD.shortname, self.momE.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momD.hostname, self.momD.hostname, self.momD.shortname,
- self.momE.hostname, self.momE.hostname, self.momE.shortname)
- self.logger.info("expected out=%s" % (expected_out,))
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.logger.info("job_out=%s" % (job_out,))
- self.assertEquals(job_out, expected_out)
- @timeout(400)
- def test_t3(self):
- """
- Test: tolerating job_start 2 node failures after adding
- extra nodes to the job, pruning
- job's assigned resources to match up to the original
- select spec, without offlining the failed vnodes, and
- with 2 execjob_prologue hooks, with prologue hook1
- having alarm1 and prologue hook2 having alarm2.
- This also test the default value to sister_join_job_alarm.
- 1. Submit a job that has been submitted with a select
- spec of 2 super-chunks say (A) and (B), and 1 chunk
- of (C), along with place spec of "scatter",
- resulting in the following assignment:
- exec_vnode = (A)+(B)+(C)
- and -Wtolerate_node_failures=job_start
- 2. Have a queuejob hook that adds 1 extra node to each
- chunk (except the MS (first) chunk), resulting in the
- assignment:
- exec_vnode = (A)+(B)+(D)+(C)+(E)
- where D mirrors super-chunk B specs while E mirrors
- chunk C.
- 3. Prior to submitting a job, suspend mom B. When job runs,
- momB won't be able to join the job, so it won't be considered
- as a "healthy" mom.
- 4. Have an execjob_prologue hook that doesn't fail any mom host
- with alarm=alarm1, order=1.
- 5. Have an execjob_prologue hook2 with alarm=alarm2, order=2,
- that fails (causes rejection) when executed by mom managing
- vnodes in (C).
- 6. Have an execjob_launch hook that prunes back the
- job's exec_vnode assignment back to satisfying the original
- 3-node select spec, choosing only healthy nodes.
- 7. Result:
- a. This results in the following reassignment of chunks:
- exec_vnode = (A)+(D)+(E)
- since (B) and (C) contain vnodes from failed moms.
- b. vnodes in (B) and (C) are now showing a state of "free".
- c. Mom's log file shows the wait time from execjob_prologue
- hook1 execution and the execution of the exescjob_launch
- hook is no more than alarm1+alarm2.
- c. The accounting log start record 'S' will reflect the
- select request where additional chunks were added, while
- the secondary start record 's' will reflect the assigned
- resources after pruning the original select request via
- the pbs.release_nodes(keep_select=...) call
- inside execjob_launch hook.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body2)
- # instantiate execjob_prologue hook #1
- hook_body = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo1")
- localnode=pbs.get_local_nodename()
- """
- hook_event = "execjob_prologue"
- hook_name = "prolo1"
- alarm1 = 17
- a = {'event': hook_event, 'enabled': 'true', 'order': 1,
- 'alarm': alarm1}
- self.server.create_import_hook(hook_name, a, hook_body)
- # instantiate execjob_prologue hook #2
- hook_body = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo2")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "prolo2: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "prolo2: found vnode_list_fail[" + v.name + "]")
- localnode=pbs.get_local_nodename()
- if not e.job.in_ms_mom() and (localnode == '%s'):
- x
- """ % (self.nC,)
- hook_event = "execjob_prologue"
- hook_name = "prolo2"
- alarm2 = 16
- a = {'event': hook_event, 'enabled': 'true', 'order': 2,
- 'alarm': alarm2}
- self.server.create_import_hook(hook_name, a, hook_body)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # temporarily suspend momB, simulating a failed mom host.
- self.momB.signal("-STOP")
- jid = self.create_and_submit_job('job1')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- # Set time to start scanning logs
- stime = int(time.time())
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job1_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_schedselect,
- 'exec_host': self.job1_exec_host,
- 'exec_vnode': self.job1_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND,
- max_attempts=100)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Verify the logs and make sure sister_join_job_alarm is honored
- logs = self.mom.log_match(
- "Executing begin",
- allmatch=True, starttime=stime, max_attempts=8)
- log1 = logs[0][1]
- logs = self.mom.log_match(
- "Executing prolo1",
- allmatch=True, starttime=stime, max_attempts=8)
- log2 = logs[0][1]
- pattern = '%m/%d/%Y %H:%M:%S'
- tmp = log1.split(';')
- # Convert the time into epoch time
- time1 = int(time.mktime(time.strptime(tmp[0], pattern)))
- tmp = log2.split(';')
- time2 = int(time.mktime(time.strptime(tmp[0], pattern)))
- diff = time2 - time1
- self.logger.info(
- "Time diff between begin hook and prologue hook is " +
- str(diff) + " seconds")
- # Leave a little wiggle room for slow systems
- # test default sister_join_job_alarm value
- sis_join_alarm = 30
- self.assertTrue((diff >= sis_join_alarm) and
- diff <= (sis_join_alarm + 5))
- self.mom.log_match(
- "sister_join_job_alarm wait time %d secs exceeded" % (
- sis_join_alarm,), starttime=stime, max_attempts=8)
- # Verify the logs and make sure job_launch_delay is honored
- logs = self.mom.log_match(
- "Executing prolo1",
- allmatch=True, starttime=stime, max_attempts=8)
- log1 = logs[0][1]
- logs = self.mom.log_match(
- "Executing launch",
- allmatch=True, starttime=stime, max_attempts=8)
- log2 = logs[0][1]
- pattern = '%m/%d/%Y %H:%M:%S'
- tmp = log1.split(';')
- # Convert the time into epoch time
- time1 = int(time.mktime(time.strptime(tmp[0], pattern)))
- tmp = log2.split(';')
- time2 = int(time.mktime(time.strptime(tmp[0], pattern)))
- diff = time2 - time1
- self.logger.info(
- "Time diff between prolo1 hook and launch hook is " +
- str(diff) + " seconds")
- # Leave a little wiggle room for slow systems
- job_launch_delay = alarm1 + alarm2
- self.assertTrue((diff >= job_launch_delay) and
- diff <= (job_launch_delay + 3))
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status(
- [self.nAv0, self.nAv1, self.nE, self.nEv0],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2],
- 'job-busy', jobs_assn1, 1, '0kb')
- jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
- self.match_vnode_status([self.nD], 'free', jobs_assn3,
- 3, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
- self.nBv1, self.nBv2, self.nBv3, self.nC,
- self.nEv1, self.nEv2, self.nEv3], 'free')
- # check server/queue counts
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
- "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
- "as job is tolerant of node failures", n=10, regexp=True)
- self.momA.log_match(
- "not all prologue hooks to sister moms completed, " +
- "but job will proceed to execute", n=10)
- # Check vnode_list[] parameter in execjob_prologue hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;prolo2: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nC]
- for vn in vnode_list_fail:
- self.momA.log_match(
- "Job;%s;launch: found vnode_list_fail[%s]" % (jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1_exec_host_esc,
- self.job1_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momD.hostname, self.momE.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.momA.shortname, self.momD.shortname, self.momE.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momD.hostname, self.momD.hostname, self.momD.shortname,
- self.momE.hostname, self.momE.hostname, self.momE.shortname)
- self.logger.info("expected out=%s" % (expected_out,))
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.logger.info("job_out=%s" % (job_out,))
- self.assertEquals(job_out, expected_out)
- @timeout(400)
- def test_t4(self):
- """
- Test: tolerating job_start 1 node failure that is used
- to satisfy a multi-chunk request, after adding
- extra nodes to the job, pruning
- job's assigned resources to match up to the original
- select spec.
- 1. Submit a job that has been submitted with a select
- spec of 2 super-chunks say (A) and (B), and 1 chunk
- of (C), along with place spec of "scatter",
- resulting in the following assignment:
- exec_vnode = (A)+(B)+(C)
- and -Wtolerate_node_failures=job_start
- 2. Have a queuejob hook that adds 1 extra node to each
- chunk (except the MS (first) chunk), resulting in the
- assignment:
- exec_vnode = (A)+(B)+(D)+(C)+(E)
- where D mirrors super-chunk B specs while E mirrors
- chunk C.
- 3. Have an execjob_begin hook that fails (causes rejection)
- when executed by mom managing vnodes in (B).
- 4. Then create an execjob_launch hook that
- prunes back the job's exec_vnode assignment back to
- satisfying the original 3-node select spec,
- choosing only healthy nodes.
- 5. Result:
- a. This results in the following reassignment of chunks:
- exec_vnode = (A)+(D)+(C)
- since (B) contain vnodes from failed moms.
- b. The accounting log start record 'S' will reflect the
- select request where additional chunks were added, while
- the secondary start record 's' will reflect the assigned
- resources after pruning the original select request via
- the pbs.release_nodes(keep_select=...) call
- inside execjob_launch hook.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job1')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job1v2_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1v2_schedselect,
- 'exec_host': self.job1v2_exec_host,
- 'exec_vnode': self.job1v2_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=70)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status([self.nAv0, self.nAv1],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2],
- 'job-busy', jobs_assn1, 1, '0kb')
- jobs_assn2 = "%s/0, %s/1" % (jid, jid)
- self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
- 2, '2097152kb')
- jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
- self.match_vnode_status([self.nD], 'free', jobs_assn3,
- 3, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
- self.nBv1, self.nBv2, self.nBv3, self.nE,
- self.nEv0, self.nEv1, self.nEv2,
- self.nEv3], 'free')
- # check server/queue counts
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1v2_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
- jid, self.hostB), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
- "is tolerant of node failures",
- regexp=True, n=10)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1v2_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1v2_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1v2_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1v2_exec_host_esc,
- self.job1v2_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1v2_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momD.hostname, self.momC.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.momA.shortname, self.momD.shortname, self.momC.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momD.hostname, self.momD.hostname, self.momD.shortname,
- self.momC.hostname, self.momC.hostname, self.momC.shortname)
- self.logger.info("expected out=%s" % (expected_out,))
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.logger.info("job_out=%s" % (job_out,))
- self.assertEquals(job_out, expected_out)
- @timeout(400)
- def test_t5(self):
- """
- Test: tolerating job_start 1 node failure used in a regular
- chunk after adding extra nodes to the job, pruning
- job's assigned resources to match up to the original
- select spec.
- 1. Submit a job that has been submitted with a select
- spec of 2 super-chunks say (A) and (B), and 1 chunk
- of (C), along with place spec of "scatter",
- resulting in the following assignment:
- exec_vnode = (A)+(B)+(C)
- and -Wtolerate_node_failures=job_start
- 2. Have a queuejob hook that adds 1 extra node to each
- chunk (except the MS (first) chunk), resulting in the
- assignment:
- exec_vnode = (A)+(B)+(D)+(C)+(E)
- where D mirrors super-chunk B specs while E mirrors
- chunk C.
- 3. Have an execjob_prologue hook that fails (causes
- rejection) when executed by mom managing vnodes in (C).
- 4. Then create an execjob_launch hook that
- prunes back the job's exec_vnode assignment back to
- satisfying the original 3-node select spec,
- choosing only healthy nodes.
- 5. Result:
- a. This results in the following reassignment of chunks:
- exec_vnode = (A)+(B)+(E)
- since (C) contain vnodes from failed moms.
- b. The accounting log start record 'S' will reflect the
- select request where additional chunks were added, while
- the secondary start record 's' will reflect the assigned
- resources after pruning the original select request via
- the pbs.release_nodes(keep_select=...) call
- inside execjob_launch hook.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_prologue hook
- hook_event = "execjob_prologue"
- hook_name = "prolo"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.prolo_hook_body2)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job1')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job1v3_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1v3_schedselect,
- 'exec_host': self.job1v3_exec_host,
- 'exec_vnode': self.job1v3_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=70)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status([self.nAv0, self.nAv1, self.nB, self.nBv0,
- self.nE, self.nEv0], 'job-busy', jobs_assn1,
- 1, '1048576kb')
- self.match_vnode_status([self.nAv2, self.nBv1],
- 'job-busy', jobs_assn1, 1, '0kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
- self.nC, self.nD, self.nEv1, self.nEv2,
- self.nEv3], 'free')
- # check server/queue counts
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1v3_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
- "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
- "as job is tolerant of node failures", n=10, regexp=True)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nC]
- for vn in vnode_list_fail:
- self.momA.log_match(
- "Job;%s;launch: found vnode_list_fail[%s]" % (jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1v3_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1v3_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1v3_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1v3_exec_host_esc,
- self.job1v3_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1v3_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momB.hostname, self.momE.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.momA.shortname, self.momB.shortname, self.momE.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momB.hostname, self.momB.hostname, self.momB.shortname,
- self.momE.hostname, self.momE.hostname, self.momE.shortname)
- self.logger.info("expected out=%s" % (expected_out,))
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.logger.info("job_out=%s" % (job_out,))
- self.assertEquals(job_out, expected_out)
- def test_t6(self):
- """
- Test: tolerating job_start of 2 node failures used to
- satisfy the smaller chunks, after adding extra nodes
- to the job, pruning job's assigned resources to match up
- to the original select spec.
- 1. Submit a job that has been submitted with a select
- spec of 2 super-chunks say (A) and (B), and 1 chunk
- of (C), along with place spec of "scatter",
- resulting in the following assignment:
- exec_vnode = (A)+(B)+(C)
- and -Wtolerate_node_failures=job_start
- 2. Have a queuejob hook that adds 1 extra node to each
- chunk (except the MS (first) chunk), resulting in the
- assignment:
- exec_vnode = (A)+(B)+(D)+(C)+(E)
- where D mirrors super-chunk B specs while E mirrors
- chunk C. (C) and (E) are of smaller chunks than (B)
- and (D). For example:
- (D) = "(nadal:ncpus=3:mem=2097152kb)"
- (C) = "(lendl:ncpus=2:mem=2097152kb)"
- 3. Have an execjob_begin hook that fails (causes
- rejection) when executed by mom managing vnodes in (C).
- 4. Have an execjob_prologue hook that fails (causes
- rejection) when executed by mom managing vnodes in (E).
- 5. Then create an execjob_launch hook that
- prunes back the job's exec_vnode assignment back to
- satisfying the original 3-node select spec,
- choosing only healthy nodes.
- 6. Result:
- a. This results in the following reassignment of chunks:
- exec_vnode = (A)+(B)+(D)
- since (C) and (E) contain vnodes from failed moms.
- Note that from (D), only allocate enough resources
- to satisfy the smaller third requested chunk.
- if (D) originally has "(nadal:ncpus=3:mem=2097152kb)",
- reassigning this would only be
- "(nadal:ncpus=2:mem=2097152kb)".
- b. The accounting log start record 'S' will reflect the
- select request where additional chunks were added, while
- the secondary start record 's' will reflect the assigned
- resources after pruning the original select request via
- the pbs.release_nodes(keep_select=...) call
- inside execjob_launch hook.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body3)
- # instantiate execjob_prologue hook
- hook_event = "execjob_prologue"
- hook_name = "prolo"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.prolo_hook_body2)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job1')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job1v4_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1v4_schedselect,
- 'exec_host': self.job1v4_exec_host,
- 'exec_vnode': self.job1v4_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=70)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status([self.nAv0, self.nAv1, self.nB, self.nBv0],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- jobs_assn2 = "%s/0, %s/1" % (jid, jid)
- self.match_vnode_status([self.nD], 'free', jobs_assn2,
- 2, '2097152kb')
- self.match_vnode_status([self.nAv2, self.nBv1],
- 'job-busy', jobs_assn1, 1, '0kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
- self.nC, self.nD, self.nEv1, self.nEv2,
- self.nEv3], 'free')
- # check server/queue counts
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1v4_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
- jid, self.hostE), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostE) +
- "is tolerant of node failures",
- regexp=True, n=10)
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
- "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
- "as job is tolerant of node failures", n=10, regexp=True)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nC, self.nE]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1v4_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1v4_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1v4_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1v4_exec_host_esc,
- self.job1v4_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1v4_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momB.hostname, self.momD.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.momA.shortname, self.momB.shortname, self.momD.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momB.hostname, self.momB.hostname, self.momB.shortname,
- self.momD.hostname, self.momD.hostname, self.momD.shortname)
- self.logger.info("expected out=%s" % (expected_out,))
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.logger.info("job_out=%s" % (job_out,))
- self.assertEquals(job_out, expected_out)
- def test_t7(self):
- """
- Test: tolerating job_start of 2 node failures used to
- satisfy the larger chunks, after adding extra nodes
- to the job. Pruning job's assigned resources to match up
- to the original select spec would fail, as the
- unsatisfied chunk requests cannot be handled by
- by the remaining smaller sized nodes. The failure
- to prune job is followed by a pbs.event().rerun()
- action and a job hold. Also, this test
- setting tolerate_node_falures=none.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body)
- # instantiate execjob_prologue hook
- hook_body = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing prologue")
- localnode=pbs.get_local_nodename()
- if not e.job.in_ms_mom() and (localnode == '%s'):
- x
- """ % (self.nD,)
- hook_event = "execjob_prologue"
- hook_name = "prolo"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, hook_body)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job1')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
- jid, self.hostB), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
- "is tolerant of node failures",
- regexp=True, n=10)
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostD) +
- "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+" % (jid, self.hostD) +
- "as job is tolerant of node failures", n=10, regexp=True)
- self.momA.log_match("Job;%s;could not satisfy select chunk" % (jid,),
- n=10)
- self.momA.log_match("Job;%s;NEED chunks for keep_select" % (jid,),
- n=10)
- self.momA.log_match(
- "Job;%s;HAVE chunks from job's exec_vnode" % (jid,), n=10)
- self.momA.log_match("execjob_launch request rejected by 'launch'",
- n=10)
- errmsg = "unsuccessful at LAUNCH"
- self.momA.log_match("Job;%s;%s" % (jid, errmsg,), n=10)
- self.server.expect(JOB, {'job_state': 'H'},
- id=jid, interval=1, max_attempts=70)
- # turn off queuejob
- self.server.manager(MGR_CMD_SET, HOOK, {'enabled': 'false'}, 'qjob')
- # modify job so as to not tolerate_node_failures
- a = {ATTR_tolerate_node_failures: "none"}
- self.server.alterjob(jobid=jid, attrib=a)
- # release hold on job
- self.server.rlsjob(jobid=jid, holdtype='s')
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+could not JOIN_JOB" % (
- jid), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostE) +
- "is tolerant of node failures",
- regexp=True, n=10, existence=False, max_attempts=10)
- self.server.expect(JOB, {'job_state': 'H'},
- id=jid, interval=1, max_attempts=15)
- # turn off begin hook, leaving prologue hook in place
- self.server.manager(MGR_CMD_SET, HOOK, {'enabled': 'false'}, 'begin')
- # release hold on job
- self.server.rlsjob(jobid=jid, holdtype='s')
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- self.momA.log_match(
- "Job;%s;job_start_error.+could not IM_EXEC_PROLOGUE" % (jid,),
- n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
- "as job is tolerant of node failures", n=10, regexp=True,
- existence=False, max_attempts=10)
- self.server.expect(JOB, {'job_state': 'H'},
- id=jid, interval=1, max_attempts=15)
- # turn off prologue hook, so only launch hook remains.
- self.server.manager(MGR_CMD_SET, HOOK, {'enabled': 'false'}, 'prolo')
- # release hold on job
- self.server.rlsjob(jobid=jid, holdtype='s')
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'none',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'exec_host': self.job1_iexec_host,
- 'exec_vnode': self.job1_iexec_vnode,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- # tolerate_node_failures=none and launch hook calls release_nodes()
- emsg = "no nodes released as job does not tolerate node failures"
- self.momA.log_match("%s: %s" % (jid, emsg), n=30)
- def test_t8(self):
- """
- Test tolerating node failures at job startup with no
- failed moms.
- 1. Submit a job that has been submitted with a select
- spec of 2 super-chunks say (A) and (B), and 1 chunk
- of (C), along with place spec of "scatter",
- resulting in the following assignment:
- exec_vnode = (A)+(B)+(C)
- and -Wtolerate_node_failures=all
- 2. Have a queuejob hook that adds 1 extra node to each
- chunk (except the MS (first) chunk), resulting in the
- assignment:
- exec_vnode = (A)+(B)+(D)+(C)+(E)
- where D mirrors super-chunk B specs while E mirrors
- chunk C.
- 3. Have an execjob_begin, execjob_prologue hooks that don't
- fail any of the sister moms.
- when executed by mom managing vnodes in (C).
- 4. Then create an execjob_launch that prunes back the job's
- exec_vnode assignment back to satisfying the original 3-node
- select spec, choosing only healthy nodes.
- 5. Result:
- a. This results in the following reassignment of chunks:
- exec_vnode = (A)+(B)+(C)
- b. The accounting log start record 'S' will reflect the
- select request where additional chunks were added, while
- the secondary start record 's' will reflect the assigned
- resources after pruning the original select request via
- the pbs.release_nodes(keep_select=...) call
- inside execjob_launch hook.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body2)
- # instantiate execjob_prologue hook
- hook_event = "execjob_prologue"
- hook_name = "prolo"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.prolo_hook_body3)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job1')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job1v5_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1v5_schedselect,
- 'exec_host': self.job1v5_exec_host,
- 'exec_vnode': self.job1v5_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status(
- [self.nAv0, self.nAv1, self.nB, self.nBv0],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2, self.nBv1],
- 'job-busy', jobs_assn1, 1, '0kb')
- jobs_assn2 = "%s/0, %s/1" % (jid, jid)
- self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
- 2, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
- self.nE, self.nEv0, self.nEv1, self.nEv2,
- self.nEv3], 'free')
- # Check server/queue counts.
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1v5_exec_host))
- # Check vnode_list[] parameter in execjob_prologue hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1v5_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1v5_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1v5_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1v5_exec_host_esc,
- self.job1v5_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1v5_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momB.hostname, self.momC.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.momA.shortname, self.momB.shortname, self.momC.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momB.hostname, self.momB.hostname, self.momB.shortname,
- self.momC.hostname, self.momC.hostname, self.momC.shortname)
- self.logger.info("expected out=%s" % (expected_out,))
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.logger.info("job_out=%s" % (job_out,))
- self.assertEquals(job_out, expected_out)
- @timeout(400)
- def test_t9(self):
- """
- Test tolerating 'all' node failures at job startup and
- within the life of the job.
- 1. Submit a job that has been submitted with a select
- spec of 2 super-chunks say (A) and (B), and 1 chunk
- of (C), along with place spec of "scatter",
- resulting in the following assignment:
- exec_vnode = (A)+(B)+(C)
- and -Wtolerate_node_failures=all
- 2. Have a queuejob hook that adds 1 extra node to each
- chunk (except the MS (first) chunk), resulting in the
- assignment:
- exec_vnode = (A)+(B)+(D)+(C)+(E)
- where D mirrors super-chunk B specs while E mirrors
- chunk C.
- 3. Have an execjob_begin hook that fails (causes rejection)
- when executed by mom managing vnodes in (B).
- 4. Have an execjob_prologue hook that fails (causes rejection)
- when executed by mom managing vnodes in (C).
- 5. Then create an execjob_launch that prunes back the job's
- exec_vnode assignment back to satisfying the original 3-node
- select spec, choosing only healthy nodes.
- 6. Now kill -KILL mom host hostD.
- 7. Result:
- a. This results in the following reassignment of chunks:
- exec_vnode = (A)+(D)+(E)
- since (B) and (C) contain vnodes from failed moms.
- b. Job continues to run even after nodeD goes down with
- only an indication in mom_logs with the message:
- im_eof, Premature end of message from addr n stream 4
- """
- # set this so as to not linger on delaying job kill.
- c = {'$max_poll_downtime': 10}
- self.momA.add_config(c)
- # instantiate queuejob hook, tolerate_node_failure is set to 'all'
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body2)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body)
- # instantiate execjob_prologue hook
- hook_event = "execjob_prologue"
- hook_name = "prolo"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job1_2')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'all',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'all',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job1_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_schedselect,
- 'exec_host': self.job1_exec_host,
- 'exec_vnode': self.job1_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status(
- [self.nAv0, self.nAv1, self.nE, self.nEv0],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2],
- 'job-busy', jobs_assn1, 1, '0kb')
- jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
- self.match_vnode_status([self.nD], 'free', jobs_assn3,
- 3, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
- self.nBv1, self.nBv2, self.nBv3, self.nC,
- self.nEv1, self.nEv2, self.nEv3], 'free')
- # Check server/queue counts.
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
- jid, self.hostB), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
- "is tolerant of node failures",
- regexp=True, n=10)
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
- "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
- "as job is tolerant of node failures", n=10, regexp=True)
- # Check vnode_list[] parameter in execjob_prologue hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_prologue hook
- vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
- for vn in vnode_list_fail:
- self.momA.log_match(
- "Job;%s;launch: found vnode_list_fail[%s]" % (jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1_exec_host_esc,
- self.job1_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1_sel_esc)
- # temporarily suspend momD, simulating a failed mom host.
- self.momD.signal("-KILL")
- self.momA.log_match("im_eof, Premature end of message.+on stream 4",
- n=10, max_attempts=30, interval=2, regexp=True)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- """ % (jid, self.momA.hostname, self.momD.hostname, self.momE.hostname,
- self.fib37_value, self.fib37_value, self.momA.shortname,
- self.momE.shortname)
- self.logger.info("expected out=%s" % (expected_out,))
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.logger.info("job_out=%s" % (job_out,))
- self.assertEquals(job_out, expected_out)
- self.momD.start()
    def test_t10(self):
        """
        Test tolerating node failures at job startup but also
        cause a failure on one of the nodes after the job has
        started.
        1. Submit a job that has been submitted with a select
           spec of 2 super-chunks say (A) and (B), and 1 chunk
           of (C), along with place spec of "scatter",
           resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=job_start
           (the job is only tolerant of node failures during
           startup; the test below asserts
           tolerate_node_failures == 'job_start')
        2. Have a queuejob hook that adds 1 extra node to each
           chunk (except the MS (first) chunk), resulting in the
           assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B specs while E mirrors
           chunk C.
        3. Have an execjob_begin hook that fails (causes rejection)
           when executed by mom managing vnodes in (B).
        4. Have an execjob_prologue hook that fails (causes rejection)
           when executed by mom managing vnodes in (C).
        5. Then create an execjob_launch that prunes back the job's
           exec_vnode assignment back to satisfying the original 3-node
           select spec, choosing only healthy nodes.
        6. Now kill -KILL mom host hostD.
        7. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(D)+(E)
              since (B) and (C) contain vnodes from failed moms.
           b. Job eventually aborts after nodeD goes down with
              an indication in mom_logs with the message:
                  "im_eof, lost communication with <host>"
                  "node EOF 1 (<host>)"
                  "kill_job"
        """
        # set this so as to not linger on delaying job kill.
        c = {'$max_poll_downtime': 10}
        self.momA.add_config(c)
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1_3')
        # Job gets queued and reflects the incremented values from queuejob
        # hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches reflecting the pruned back values
        # to the original select spec
        # There's a max_attempts=60 for it would take up to 60 seconds
        # for primary mom to wait for the sisters to join
        # (default $sister_join_job_alarm of 30 seconds) and to wait for
        # sisters to execjob_prologue hooks (default $job_launch_delay
        # value of 30 seconds)
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status(
            [self.nAv0, self.nAv1, self.nE, self.nEv0],
            'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
        # Verify mom_logs: the begin/prologue hook rejections from hostB and
        # hostC are ignored because the job tolerates startup failures.
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_prologue hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        # Check accounting_logs: 'S' holds the incremented (queuejob) spec,
        # 's' holds the pruned assignment after release_nodes().
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1_sel_esc)
        # temporarily suspend momD, simulating a failed mom host.
        self.momD.signal("-KILL")
        # With tolerate_node_failures=job_start, a node failure after
        # startup is fatal: primary mom kills the job.
        self.momA.log_match(
            "Job;%s;im_eof, lost communication with %s.+killing job now" % (
                jid, self.nD), n=10, max_attempts=30, interval=2, regexp=True)
        self.momA.log_match("Job;%s;kill_job" % (jid,),
                            n=10, max_attempts=60, interval=2)
        self.momD.start()
    def test_t11(self):
        """
        Test: tolerating node failures at job startup with
        job having an ncpus=0 assignment. This ensures
        the hooks will have the info for the ncpus=0 chunks
        in pbs.event().vnode_list[].
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job2')
        # Job gets queued and reflects the incremented values from queuejob
        # hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 9,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job2_iselect,
                                 'Resource_List.site': self.job2_oselect,
                                 'Resource_List.place': self.job2_place,
                                 'schedselect': self.job2_ischedselect},
                           max_attempts=10, id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches reflecting the pruned back values
        # to the original select spec
        # There's a max_attempts=60 for it would take up to 60 seconds
        # for primary mom to wait for the sisters to join
        # (default $sister_join_job_alarm of 30 seconds) and to wait for
        # sisters to execjob_prologue hooks (default $job_launch_delay
        # value of 30 seconds)
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 6,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job2_select,
                                 'Resource_List.place': self.job2_place,
                                 'schedselect': self.job2_schedselect,
                                 'exec_host': self.job2_exec_host,
                                 'exec_vnode': self.job2_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        # ncpus=0 chunks: vnodes nE/nEv0 carry the job (mem assigned) but
        # remain 'free' with 0 cpus assigned.
        self.match_vnode_status([self.nE, self.nEv0],
                                'free', jobs_assn1, 0, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job2_exec_host_nfile))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_prologue hook;
        # ncpus=0 vnodes (nE, nEv0) must also appear.
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_prologue hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match(
                "Job;%s;launch: found vnode_list_fail[%s]" % (jid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job2_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job2_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job2_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job2_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job2_iexec_host_esc,
                                  self.job2_iexec_vnode_esc, "10gb", 9, 5,
                                  self.job2_place,
                                  self.job2_isel_esc)
        self.match_accounting_log('s', jid, self.job2_exec_host_esc,
                                  self.job2_exec_vnode_esc,
                                  "6gb", 6, 3,
                                  self.job2_place,
                                  self.job2_sel_esc)
    def test_t12(self):
        """
        Test: tolerating node failures at job startup with
        extra resources requested such as mpiprocs and
        ompthreads which would affect content of $PBS_NODEFILE.
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job3')
        # Job gets queued and reflects the incremented values from queuejob
        # hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job3_iselect,
                                 'Resource_List.site': self.job3_oselect,
                                 'Resource_List.place': self.job3_place,
                                 'schedselect': self.job3_ischedselect},
                           max_attempts=10, id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches reflecting the pruned back values
        # to the original select spec
        # There's a max_attempts=60 for it would take up to 60 seconds
        # for primary mom to wait for the sisters to join
        # (default $sister_join_job_alarm of 30 seconds) and to wait for
        # sisters to execjob_prologue hooks (default $job_launch_delay
        # value of 30 seconds)
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job3_select,
                                 'Resource_List.place': self.job3_place,
                                 'schedselect': self.job3_schedselect,
                                 'exec_host': self.job3_exec_host,
                                 'exec_vnode': self.job3_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nE, self.nEv0],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        # schedselect is passed here so the nodefile check accounts for
        # mpiprocs/ompthreads repetition in $PBS_NODEFILE.
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job3_exec_host,
                                              self.job3_schedselect))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_prologue hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job3_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job3_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job3_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job3_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job3_iexec_host_esc,
                                  self.job3_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job3_place,
                                  self.job3_isel_esc)
        self.match_accounting_log('s', jid, self.job3_exec_host_esc,
                                  self.job3_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job3_place,
                                  self.job3_sel_esc)
- def test_t13(self):
- """
- Test: pbs.event().job.select.increment_chunks() method.
- """
- # instantiate queuejob hook
- hook_body = """
- import pbs
- e=pbs.event()
- sel=pbs.select("ncpus=3:mem=1gb+1:ncpus=2:mem=2gb+2:ncpus=1:mem=3gb")
- inp=2
- isel=sel.increment_chunks(inp)
- pbs.logmsg(pbs.LOG_DEBUG, "sel=%s" % (sel,))
- pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%d)=%s" % (inp,isel))
- inp="3"
- isel=sel.increment_chunks(inp)
- pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%s)=%s" % (inp,isel))
- inp="23.5%"
- isel=sel.increment_chunks(inp)
- pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%s)=%s" % (inp,isel))
- inp={0: 0, 1: 4, 2: "50%"}
- isel=sel.increment_chunks(inp)
- pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%s)=%s" % (inp,isel))
- sel=pbs.select("5:ncpus=3:mem=1gb+1:ncpus=2:mem=2gb+2:ncpus=1:mem=3gb")
- pbs.logmsg(pbs.LOG_DEBUG, "sel=%s" % (sel,))
- inp={0: "50%", 1: "50%", 2: "50%"}
- isel=sel.increment_chunks(inp)
- pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%s)=%s" % (inp,isel))
- """
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, hook_body)
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- j1 = Job(TEST_USER)
- j1.set_sleep_time(10)
- self.server.submit(j1)
- # Verify server_logs
- self.server.log_match(
- "sel=ncpus=3:mem=1gb+1:ncpus=2:mem=2gb+2:ncpus=1:mem=3gb", n=10)
- self.server.log_match(
- "sel.increment_chunks(2)=1:ncpus=3:mem=1gb+" +
- "3:ncpus=2:mem=2gb+4:ncpus=1:mem=3gb", n=10)
- self.server.log_match(
- "sel.increment_chunks(3)=1:ncpus=3:mem=1gb+" +
- "4:ncpus=2:mem=2gb+5:ncpus=1:mem=3gb", n=10)
- self.server.log_match(
- "sel.increment_chunks(23.5%)=1:ncpus=3:mem=1gb+" +
- "2:ncpus=2:mem=2gb+3:ncpus=1:mem=3gb", n=10)
- self.server.log_match(
- "sel.increment_chunks({0: 0, 1: 4, 2: \'50%\'})=1:ncpus=3:" +
- "mem=1gb+5:ncpus=2:mem=2gb+3:ncpus=1:mem=3gb", n=10)
- self.server.log_match(
- "sel=5:ncpus=3:mem=1gb+1:ncpus=2:mem=2gb+2:ncpus=1:mem=3gb",
- n=10)
- self.server.log_match(
- "sel.increment_chunks({0: \'50%\', 1: \'50%\', 2: \'50%\'})=" +
- "7:ncpus=3:mem=1gb+2:ncpus=2:mem=2gb+3:ncpus=1:mem=3gb", n=10)
- def test_t14(self):
- """
- Test: tolerating job_start of no node failures,
- but pruning job's assigned nodes to satisfy the original
- select spec + 1 additional node.
- Basically, given an original spec requiring
- 3 nodes, and a queuejob hook has added 2 more nodes,
- resulting in a new assignment:
- exec_vnode=(A)+(B)+(C)+(D)+(E) where
- (C) mirrors (B) and satisfy the second chunk, and (E)
- mirrors (D) and satisfy the third chunk.
- Now pruning the assigned nodes to need 4 nodes, would
- result in:
- exec_vnode=(A)+(B)+(D)+(e1)
- where (E) is a super-chunk of the form (e1+e2) and only
- need 'e1' part.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_launch hook
- hook_body = """
- import pbs
- e=pbs.event()
- if 'PBS_NODEFILE' not in e.env:
- e.accept()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
- if e.job.in_ms_mom():
- new_jsel = e.job.Resource_List["site"] + "+ncpus=1:mem=1gb"
- pj = e.job.release_nodes(keep_select=new_jsel)
- pbs.logmsg(pbs.LOG_DEBUG, "release_nodes(keep_select=%s)" % (new_jsel,))
- if pj != None:
- pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
- pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
- pbs.logjobmsg(e.job.id,
- "launch: job.schedselect=%s" % (pj.schedselect,))
- else:
- e.job.delete()
- msg = "unsuccessful at LAUNCH"
- e.reject("unsuccessful at LAUNCH")
- """
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job1_4')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '7gb',
- 'Resource_List.ncpus': 9,
- 'Resource_List.nodect': 4,
- 'Resource_List.select': self.job1v6_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1v6_schedselect,
- 'exec_host': self.job1v6_exec_host,
- 'exec_vnode': self.job1v6_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status(
- [self.nAv0, self.nAv1, self.nB, self.nBv0, self.nE],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2, self.nBv1],
- 'job-busy', jobs_assn1, 1, '0kb')
- jobs_assn2 = "%s/0, %s/1" % (jid, jid)
- self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
- 2, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
- self.nEv0, self.nEv1, self.nEv2,
- self.nEv3], 'free')
- # Check server/queue counts.
- self.server.expect(SERVER, {'resources_assigned.ncpus': 9,
- 'resources_assigned.mem': '7340032kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 9,
- 'resources_assigned.mem': '7340032kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1v6_exec_host))
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1v6_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1v6_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1v6_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1v6_exec_host_esc,
- self.job1v6_exec_vnode_esc,
- "7gb", 9, 4,
- self.job1_place,
- self.job1v6_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- pbsdsh -n 3 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- pbsdsh -n 3 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momB.hostname, self.momC.hostname,
- self.momE.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.fib37_value,
- self.momA.shortname, self.momB.shortname, self.momC.shortname,
- self.momE.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momB.hostname, self.momB.hostname, self.momB.shortname,
- self.momC.hostname, self.momC.hostname, self.momC.shortname,
- self.momE.hostname, self.momE.hostname, self.momE.shortname)
- self.logger.info("expected out=%s" % (expected_out,))
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.logger.info("job_out=%s" % (job_out,))
- self.assertEquals(job_out, expected_out)
- def test_t15(self):
- """
- Test: tolerating job_start of no node failures,
- but pruning job's assigned nodes to satisfy the original
- select spec minus 1 node, except one of the chunks is.
- unsatisfiable. This time, the action pbs.event().delete()
- action is specified on a failure to prune the job.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_launch hook
- hook_body = """
- import pbs
- e=pbs.event()
- if 'PBS_NODEFILE' not in e.env:
- e.accept()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
- for vn in e.vnode_list:
- v = e.vnode_list[vn]
- pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
- for vn in e.vnode_list_fail:
- v = e.vnode_list_fail[vn]
- pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
- if e.job.in_ms_mom():
- new_jsel ="ncpus=3:mem=2gb+ncpus=5:mem=3gb"
- pj = e.job.release_nodes(keep_select=new_jsel)
- pbs.logmsg(pbs.LOG_DEBUG, "release_nodes(keep_select=%s)" % (new_jsel,))
- if pj != None:
- pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
- pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
- pbs.logjobmsg(e.job.id,
- "launch: job.schedselect=%s" % (pj.schedselect,))
- else:
- e.job.delete()
- msg = "unsuccessful at LAUNCH"
- e.reject("unsuccessful at LAUNCH")
- """
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job1_4')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- self.momA.log_match("Job;%s;could not satisfy select chunk" % (jid,),
- n=10, max_attempts=60, interval=2)
- self.momA.log_match("Job;%s;NEED chunks for keep_select" % (jid,),
- n=10)
- self.momA.log_match(
- "Job;%s;HAVE chunks from job's exec_vnode" % (jid,), n=10)
- self.momA.log_match("execjob_launch request rejected by 'launch'",
- n=10)
- errmsg = "unsuccessful at LAUNCH"
- self.momA.log_match("Job;%s;%s" % (jid, errmsg,), n=10)
- self.server.expect(JOB, 'queue', op=UNSET, id=jid)
- def test_t16(self):
- """
- Test: tolerating node failures at job startup with
- a job submitted with -l place="scatter:excl".
- Like jobs submitted with only "-l place=scatter"
- except the vnodes assigned would have a
- "job-exclusive" state.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_body = """
- import pbs
- e=pbs.event()
- pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
- localnode=pbs.get_local_nodename()
- if not e.job.in_ms_mom() and (localnode == '%s'):
- x
- """ % (self.nB,)
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, hook_body)
- # instantiate execjob_prologue hook
- hook_event = "execjob_prologue"
- hook_name = "prolo"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job4')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job4_iselect,
- 'Resource_List.site': self.job4_oselect,
- 'Resource_List.place': self.job4_place,
- 'schedselect': self.job4_ischedselect},
- max_attempts=10, id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job4_select,
- 'Resource_List.place': self.job4_place,
- 'schedselect': self.job4_schedselect,
- 'exec_host': self.job4_exec_host,
- 'exec_vnode': self.job4_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status([self.nAv0, self.nAv1],
- 'job-exclusive', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nE, self.nEv0],
- 'job-exclusive', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2],
- 'job-exclusive', jobs_assn1, 1, '0kb')
- jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
- self.match_vnode_status([self.nD], 'job-exclusive', jobs_assn3,
- 3, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
- self.nBv1, self.nBv2, self.nBv3, self.nC,
- self.nEv1, self.nEv2, self.nEv3], 'free')
- # Check server/queue counts.
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job4_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
- jid, self.hostB), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
- "is tolerant of node failures",
- regexp=True, n=10)
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
- "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
- "as job is tolerant of node failures", n=10, regexp=True)
- # Check vnode_list[] parameter in execjob_prologue hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_prologue hook
- vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job4_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job4_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job4_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job4_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job4_iexec_host_esc,
- self.job4_iexec_vnode_esc, "10gb", 13, 5,
- self.job4_place,
- self.job4_isel_esc)
- self.match_accounting_log('s', jid, self.job4_exec_host_esc,
- self.job4_exec_vnode_esc,
- "6gb", 8, 3,
- self.job4_place,
- self.job4_sel_esc)
- def test_t17(self):
- """
- Test: tolerating 1 node failure at job startup with
- a job submitted with -l place="free".
- Like jobs submitted with only "-l place=scatter"
- except some vnodes from the same mom would get
- allocated to satisfy different chunks.
- This test breaks apart one of the multi-chunks of
- the form (b1+b2+b3) so that upon reassignment,
- (b1+b2) is used.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body4)
- # instantiate execjob_prologue hook
- hook_event = "execjob_prologue"
- hook_name = "prolo"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.prolo_hook_body3)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body2)
- # First, turn off scheduling
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- jid = self.create_and_submit_job('job5')
- # Job gets queued and reflects the incremented values from queuejob
- # hook
- self.server.expect(JOB, {'job_state': 'Q',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'Resource_List.select': self.job5_iselect,
- 'Resource_List.site': self.job5_oselect,
- 'Resource_List.place': self.job5_place,
- 'schedselect': self.job5_ischedselect},
- max_attempts=10, id=jid, attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Job eventually launches reflecting the pruned back values
- # to the original select spec
- # There's a max_attempts=60 for it would take up to 60 seconds
- # for primary mom to wait for the sisters to join
- # (default $sister_join_job_alarm of 30 seconds) and to wait for
- # sisters to execjob_prologue hooks (default $job_launch_delay
- # value of 30 seconds)
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '5gb',
- 'Resource_List.ncpus': 7,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job5_select,
- 'Resource_List.place': self.job5_place,
- 'schedselect': self.job5_schedselect,
- 'exec_host': self.job5_exec_host,
- 'exec_vnode': self.job5_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status(
- [self.nAv0, self.nAv1, self.nB, self.nBv0],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2, self.nBv2],
- 'job-busy', jobs_assn1, 1, '0kb')
- # due to free placement, job appears twice as it's been allocated
- # twice, one for mem only and the other for ncpus
- jobs_assn2 = "%s/0, %s/0" % (jid, jid)
- self.match_vnode_status([self.nBv1],
- 'job-busy', jobs_assn2, 1, '1048576kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nC, self.nD,
- self.nD, self.nE, self.nEv0, self.nEv1,
- self.nEv2, self.nEv3, self.nBv3], 'free')
- # Check server/queue counts.
- self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
- 'resources_assigned.mem': '5242880kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
- 'resources_assigned.mem': '5242880kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job5_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
- jid, self.hostD), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostD) +
- "is tolerant of node failures",
- regexp=True, n=10)
- # Check vnode_list[] parameter in execjob_prologue hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nD]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job5_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job5_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job5_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job5_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job5_iexec_host_esc,
- self.job5_iexec_vnode_esc, "10gb", 13, 5,
- self.job5_place,
- self.job5_isel_esc)
- self.match_accounting_log('s', jid, self.job5_exec_host_esc,
- self.job5_exec_vnode_esc,
- "5gb", 7, 3,
- self.job5_place,
- self.job5_sel_esc)
- def test_t18(self):
- """
- Test: having a node failure tolerant job waiting for healthy nodes
- to get rerun (i.e. qrerun). Upon qrerun, job should get
- killed, requeued, and restarted.
- """
- # instantiate queuejob hook
- hook_event = "queuejob"
- hook_name = "qjob"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
- # instantiate execjob_begin hook
- hook_event = "execjob_begin"
- hook_name = "begin"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.begin_hook_body)
- # instantiate execjob_launch hook
- hook_event = "execjob_launch"
- hook_name = "launch"
- a = {'event': hook_event, 'enabled': 'true'}
- self.server.create_import_hook(hook_name, a, self.launch_hook_body)
- jid = self.create_and_submit_job('job1')
- # job's substate is 41 (PRERUN) since it would be waiting for
- # healthy nodes being a node failure tolerant job.
- # With no prologue hook, MS would wait the default 30
- # seconds for healthy nodes.
- self.server.expect(JOB, {'job_state': 'R',
- 'substate': 41,
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '10gb',
- 'Resource_List.ncpus': 13,
- 'Resource_List.nodect': 5,
- 'exec_host': self.job1_iexec_host,
- 'exec_vnode': self.job1_iexec_vnode,
- 'Resource_List.select': self.job1_iselect,
- 'Resource_List.site': self.job1_oselect,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1_ischedselect},
- id=jid, attrop=PTL_AND)
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status([self.nAv0, self.nAv1, self.nB, self.nBv0,
- self.nE, self.nEv0],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2, self.nBv1],
- 'job-busy', jobs_assn1, 1, '0kb')
- jobs_assn2 = "%s/0, %s/1" % (jid, jid)
- self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
- 2, '2097152kb')
- jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
- self.match_vnode_status([self.nD], 'free', jobs_assn3,
- 3, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
- self.nEv1, self.nEv2, self.nEv3], 'free')
- # check server/queue counts
- self.server.expect(SERVER, {'resources_assigned.ncpus': 13,
- 'resources_assigned.mem': '10485760'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 13,
- 'resources_assigned.mem': '10485760'},
- id='workq', attrop=PTL_AND)
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
- jid, self.hostB), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
- "is tolerant of node failures",
- regexp=True, n=10)
- a = {'scheduling': 'false'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- self.server.rerunjob(jid)
- self.server.expect(JOB, {'job_state': 'Q'}, id=jid)
- self.match_vnode_status([self.nA, self.nAv0, self.nAv1, self.nAv2,
- self.nAv3,
- self.nB, self.nBv0, self.nBv1, self.nBv2,
- self.nBv3, self.nC, self.nD, self.nE,
- self.nEv0, self.nEv1, self.nEv2,
- self.nEv3], 'free')
- # check server/queue counts
- self.server.expect(SERVER, {'resources_assigned.ncpus': 0,
- 'resources_assigned.mem': '0kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 0,
- 'resources_assigned.mem': '0kb'},
- id='workq', attrop=PTL_AND)
- a = {'scheduling': 'true'}
- self.server.manager(MGR_CMD_SET, SERVER, a)
- # Now job should start running again
- self.server.expect(JOB, {'job_state': 'R',
- 'tolerate_node_failures': 'job_start',
- 'Resource_List.mem': '6gb',
- 'Resource_List.ncpus': 8,
- 'Resource_List.nodect': 3,
- 'Resource_List.select': self.job1v2_select,
- 'Resource_List.place': self.job1_place,
- 'schedselect': self.job1v2_schedselect,
- 'exec_host': self.job1v2_exec_host,
- 'exec_vnode': self.job1v2_exec_vnode},
- id=jid, interval=1, attrop=PTL_AND, max_attempts=70)
- thisjob = self.server.status(JOB, id=jid)
- if thisjob:
- job_output_file = thisjob[0]['Output_Path'].split(':')[1]
- # Check various vnode status.
- jobs_assn1 = "%s/0" % (jid,)
- self.match_vnode_status([self.nAv0, self.nAv1],
- 'job-busy', jobs_assn1, 1, '1048576kb')
- self.match_vnode_status([self.nAv2],
- 'job-busy', jobs_assn1, 1, '0kb')
- jobs_assn2 = "%s/0, %s/1" % (jid, jid)
- self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
- 2, '2097152kb')
- jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
- self.match_vnode_status([self.nD], 'free', jobs_assn3,
- 3, '2097152kb')
- self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
- self.nBv1, self.nBv2, self.nBv3, self.nE,
- self.nEv0, self.nEv1, self.nEv2,
- self.nEv3], 'free')
- # check server/queue counts
- self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- attrop=PTL_AND)
- self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
- 'resources_assigned.mem': '6291456kb'},
- id='workq', attrop=PTL_AND)
- self.assertTrue(
- self.pbs_nodefile_match_exec_host(jid, self.job1v2_exec_host))
- # Verify mom_logs
- self.momA.log_match(
- "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
- jid, self.hostB), n=10, regexp=True)
- self.momA.log_match(
- "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
- "is tolerant of node failures",
- regexp=True, n=10)
- # Check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1v2_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1v2_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1v2_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1v2_exec_host_esc,
- self.job1v2_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1v2_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momD.hostname, self.momC.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.momA.shortname, self.momD.shortname, self.momC.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momD.hostname, self.momD.hostname, self.momD.shortname,
- self.momC.hostname, self.momC.hostname, self.momC.shortname)
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.assertEquals(job_out, expected_out)
- # Re-check vnode_list[] parameter in execjob_launch hook
- vnode_list = [self.nAv0, self.nAv1, self.nAv2,
- self.nB, self.nBv0, self.nBv1,
- self.nC, self.nD, self.nE, self.nEv0]
- for vn in vnode_list:
- self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
- jid, vn), n=10)
- # Re-check vnode_list_fail[] parameter in execjob_launch hook
- vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
- for vn in vnode_list_fail:
- self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
- jid, vn), n=10)
- # Check result of pbs.event().job.release_nodes(keep_select) call
- self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
- jid, self.job1v2_exec_vnode), n=10)
- self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
- jid, self.job1v2_schedselect), n=10)
- self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
- jid, self.job1_iexec_vnode), n=10)
- self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
- jid, self.job1v2_exec_vnode), n=10)
- # Check accounting_logs
- self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
- self.job1_iexec_vnode_esc, "10gb", 13, 5,
- self.job1_place,
- self.job1_isel_esc)
- self.match_accounting_log('s', jid, self.job1v2_exec_host_esc,
- self.job1v2_exec_vnode_esc,
- "6gb", 8, 3,
- self.job1_place,
- self.job1v2_sel_esc)
- self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
- n=10, max_attempts=60, interval=2, regexp=True)
- self.momA.log_match("Job;%s;copy file request received" % (jid,),
- n=10, max_attempts=10, interval=2)
- # validate output
- expected_out = """/var/spool/pbs/aux/%s
- %s
- %s
- %s
- FIB TESTS
- pbsdsh -n 1 fib 37
- %d
- pbsdsh -n 2 fib 37
- %d
- fib 37
- %d
- HOSTNAME TESTS
- pbsdsh -n 0 hostname
- %s
- pbsdsh -n 1 hostname
- %s
- pbsdsh -n 2 hostname
- %s
- PBS_NODEFILE tests
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- HOST=%s
- pbs_tmrsh %s hostname
- %s
- """ % (jid, self.momA.hostname, self.momD.hostname, self.momC.hostname,
- self.fib37_value, self.fib37_value, self.fib37_value,
- self.momA.shortname, self.momD.shortname, self.momC.shortname,
- self.momA.hostname, self.momA.hostname, self.momA.shortname,
- self.momD.hostname, self.momD.hostname, self.momD.shortname,
- self.momC.hostname, self.momC.hostname, self.momC.shortname)
- job_out = ""
- with open(job_output_file, 'r') as fd:
- job_out = fd.read()
- self.assertEquals(job_out, expected_out)
def test_t19(self):
    """
    Test: having a node tolerant job waiting for healthy nodes
    to get issued a request to release nodes. The call
    to pbs_release_nodes would fail given that the job
    is not fully running yet, still figuring out which nodes
    assigned are deemed good.
    """
    # Install the queuejob, execjob_begin, and execjob_launch hooks
    # that make the submitted job tolerant of node failures at start.
    hook_specs = [
        ("qjob", "queuejob", self.qjob_hook_body),
        ("begin", "execjob_begin", self.begin_hook_body),
        ("launch", "execjob_launch", self.launch_hook_body),
    ]
    for hk_name, hk_event, hk_body in hook_specs:
        hk_attrs = {'event': hk_event, 'enabled': 'true'}
        self.server.create_import_hook(hk_name, hk_attrs, hk_body)
    jid = self.create_and_submit_job('job1')
    # The job's substate stays at 41 (PRERUN): being a node failure
    # tolerant job, it is still waiting for healthy nodes.
    expected_attrs = {'job_state': 'R',
                      'substate': 41,
                      'tolerate_node_failures': 'job_start',
                      'Resource_List.mem': '10gb',
                      'Resource_List.ncpus': 13,
                      'Resource_List.nodect': 5,
                      'exec_host': self.job1_iexec_host,
                      'exec_vnode': self.job1_iexec_vnode,
                      'Resource_List.select': self.job1_iselect,
                      'Resource_List.site': self.job1_oselect,
                      'Resource_List.place': self.job1_place,
                      'schedselect': self.job1_ischedselect}
    self.server.expect(JOB, expected_attrs, id=jid, attrop=PTL_AND)
    # Verify mom_logs: the JOIN_JOB failure from hostB is tolerated.
    self.momA.log_match(
        "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
            jid, self.hostB), n=10, regexp=True)
    self.momA.log_match(
        "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
        "is tolerant of node failures",
        regexp=True, n=10)
    # Run pbs_release_nodes on a job whose state is running but whose
    # substate is still PRERUN; the request must be rejected.
    release_cmd = [os.path.join(self.server.pbs_conf['PBS_EXEC'],
                                'bin', 'pbs_release_nodes'),
                   '-j', jid, '-a']
    result = self.server.du.run_cmd(self.server.hostname, release_cmd,
                                    runas=TEST_USER)
    self.assertNotEqual(result['rc'], 0)
    self.assertTrue(result['err'][0].startswith(
        'pbs_release_nodes: Request invalid for state of job'))
def test_t20(self):
    """
    Test: node failure tolerant job array, with multiple subjobs
    starting at the same time, and job's assigned resources
    are pruned to match up to the original select spec using
    an execjob_prologue hook this time.
    """
    # instantiate queuejob hook
    hook_event = "queuejob"
    hook_name = "qjob"
    a = {'event': hook_event, 'enabled': 'true'}
    self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
    # instantiate execjob_begin hook
    hook_event = "execjob_begin"
    hook_name = "begin"
    a = {'event': hook_event, 'enabled': 'true'}
    self.server.create_import_hook(hook_name, a, self.begin_hook_body5)
    # instantiate execjob_prologue hook (does the node pruning here)
    hook_event = "execjob_prologue"
    hook_name = "prolo"
    a = {'event': hook_event, 'enabled': 'true'}
    self.server.create_import_hook(hook_name, a, self.prolo_hook_body4)
    # First, turn off scheduling so the array job stays queued until
    # all subjobs can start in the same scheduling cycle.
    a = {'scheduling': 'false'}
    self.server.manager(MGR_CMD_SET, SERVER, a)
    jid = self.create_and_submit_job('jobA')
    # Job gets queued and reflects the incremented values from queuejob
    # hook
    self.server.expect(JOB, {'job_state': 'Q',
                             'tolerate_node_failures': 'job_start',
                             'Resource_List.mem': '5gb',
                             'Resource_List.ncpus': 5,
                             'Resource_List.nodect': 5,
                             'Resource_List.select': self.jobA_iselect,
                             'Resource_List.site': self.jobA_oselect,
                             'Resource_List.place': self.jobA_place,
                             'schedselect': self.jobA_ischedselect},
                       id=jid, attrop=PTL_AND)
    a = {'scheduling': 'true'}
    self.server.manager(MGR_CMD_SET, SERVER, a)
    # Array job transitions to 'B' (begun) once subjobs start.
    self.server.expect(JOB, {'job_state': 'B',
                             'tolerate_node_failures': 'job_start',
                             'Resource_List.mem': '5gb',
                             'Resource_List.ncpus': 5,
                             'Resource_List.nodect': 5,
                             'Resource_List.select': self.jobA_iselect,
                             'Resource_List.site': self.jobA_oselect,
                             'Resource_List.place': self.jobA_place,
                             'schedselect': self.jobA_ischedselect},
                       id=jid, attrop=PTL_AND)
    # All three subjobs should be running concurrently.
    self.server.expect(JOB, {'job_state=R': 3}, extend='t')
    for idx in range(1, 4):
        sjid = create_subjob_id(jid, idx)
        # Per-subjob expected assignments (initial and pruned values).
        if idx == 1:
            iexec_host_esc = self.jobA_iexec_host1_esc
            iexec_vnode = self.jobA_iexec_vnode1
            iexec_vnode_esc = self.jobA_iexec_vnode1_esc
            exec_host = self.jobA_exec_host1
            exec_host_esc = self.jobA_exec_host1_esc
            exec_vnode = self.jobA_exec_vnode1
            exec_vnode_esc = self.jobA_exec_vnode1_esc
            vnode_list = [self.nAv0, self.nB, self.nC,
                          self.nD, self.nE]
        elif idx == 2:
            iexec_host_esc = self.jobA_iexec_host2_esc
            iexec_vnode = self.jobA_iexec_vnode2
            iexec_vnode_esc = self.jobA_iexec_vnode2_esc
            exec_host = self.jobA_exec_host2
            exec_host_esc = self.jobA_exec_host2_esc
            exec_vnode = self.jobA_exec_vnode2
            exec_vnode_esc = self.jobA_exec_vnode2_esc
            vnode_list = [self.nAv1, self.nBv0, self.nC,
                          self.nD, self.nEv0]
        elif idx == 3:
            iexec_host_esc = self.jobA_iexec_host3_esc
            iexec_vnode = self.jobA_iexec_vnode3
            iexec_vnode_esc = self.jobA_iexec_vnode3_esc
            exec_host = self.jobA_exec_host3
            exec_host_esc = self.jobA_exec_host3_esc
            exec_vnode = self.jobA_exec_vnode3
            exec_vnode_esc = self.jobA_exec_vnode3_esc
            vnode_list = [self.nAv2, self.nBv1, self.nC,
                          self.nD, self.nE]
        self.server.expect(JOB, {'job_state': 'R',
                                 'substate': 41,
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '3gb',
                                 'Resource_List.ncpus': 3,
                                 'Resource_List.nodect': 3,
                                 'exec_host': exec_host,
                                 'exec_vnode': exec_vnode,
                                 'Resource_List.select': self.jobA_select,
                                 'Resource_List.site': self.jobA_oselect,
                                 'Resource_List.place': self.jobA_place,
                                 'schedselect': self.jobA_schedselect},
                           id=sjid, attrop=PTL_AND)
        # Verify mom_logs.
        # Escape regex metacharacters in the subjob id (e.g.
        # "123[1].server") before using it in log_match patterns.
        # Raw strings are used here: "\[" in a plain string literal is
        # an invalid escape sequence (SyntaxWarning in Python 3.12+).
        sjid_esc = sjid.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                sjid_esc, self.hostC), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (
                sjid_esc, self.hostC) + "is tolerant of node failures",
            regexp=True, n=10)
        # vnode_list[] parameter seen by the execjob_prologue hook
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                sjid, vn), n=10)
        # vnode_list_fail[] parameter: only nC failed to join
        vnode_list_fail = [self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match(
                "Job;%s;prolo: found vnode_list_fail[%s]" % (
                    sjid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select)
        # call
        self.momA.log_match("Job;%s;prolo: job.exec_vnode=%s" % (
            sjid, exec_vnode), n=10)
        self.momA.log_match("Job;%s;prolo: job.schedselect=%s" % (
            sjid, self.jobA_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            sjid, iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            sjid, exec_vnode), n=10)
        # Check accounting_logs: 'S' has the original assignment,
        # 's' the pruned one.
        self.match_accounting_log('S', sjid_esc, iexec_host_esc,
                                  iexec_vnode_esc, "5gb", 5, 5,
                                  self.jobA_place,
                                  self.jobA_isel_esc)
        self.match_accounting_log('s', sjid_esc, exec_host_esc,
                                  exec_vnode_esc,
                                  "3gb", 3, 3,
                                  self.jobA_place,
                                  self.jobA_sel_esc)
@timeout(400)
def test_t21(self):
    """
    Test: radio silent moms causing the primary mom to not get
    any acks from the sister moms executing prologue hooks.
    After some 'job_launch_delay' time has passed, primary
    mom will consider node hosts that have not acknowledged
    the prologue hook execution as failed hosts, and will
    not use their vnodes in the pruning of jobs.
    """
    job_launch_delay = 120
    c = {'$job_launch_delay': job_launch_delay}
    self.momA.add_config(c)
    # instantiate queuejob hook
    hook_event = "queuejob"
    hook_name = "qjob"
    a = {'event': hook_event, 'enabled': 'true'}
    self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
    # instantiate execjob_prologue hook (sleeps on the sister moms)
    hook_event = "execjob_prologue"
    hook_name = "prolo"
    a = {'event': hook_event, 'enabled': 'true', 'alarm': 60}
    self.server.create_import_hook(hook_name, a, self.prolo_hook_body5)
    # instantiate execjob_launch hook
    hook_event = "execjob_launch"
    hook_name = "launch"
    a = {'event': hook_event, 'enabled': 'true'}
    self.server.create_import_hook(hook_name, a, self.launch_hook_body)
    # First, turn off scheduling
    a = {'scheduling': 'false'}
    self.server.manager(MGR_CMD_SET, SERVER, a)
    jid = self.create_and_submit_job('job1')
    # Job gets queued and reflects the incremented values from queuejob
    # hook
    self.server.expect(JOB, {'job_state': 'Q',
                             'tolerate_node_failures': 'job_start',
                             'Resource_List.mem': '10gb',
                             'Resource_List.ncpus': 13,
                             'Resource_List.nodect': 5,
                             'Resource_List.select': self.job1_iselect,
                             'Resource_List.site': self.job1_oselect,
                             'Resource_List.place': self.job1_place,
                             'schedselect': self.job1_ischedselect},
                       id=jid, attrop=PTL_AND)
    a = {'scheduling': 'true'}
    self.server.manager(MGR_CMD_SET, SERVER, a)
    self.momE.log_match(
        "Job;%s;sleeping for 30 secs" % (jid, ), n=10)
    # temporarily suspend momE, simulating a radio silent mom.
    self.momE.signal("-STOP")
    self.momC.log_match(
        "Job;%s;sleeping for 30 secs" % (jid, ), n=10)
    # temporarily suspend momC, simulating a radio silent mom.
    self.momC.signal("-STOP")
    # sleep as long as the time primary mom waits for all
    # prologue hook acknowledgement from the sister moms
    self.logger.info("sleeping for %d secs waiting for healthy nodes" % (
        job_launch_delay,))
    time.sleep(job_launch_delay)
    # Job eventually launches reflecting the pruned back values
    # to the original select spec
    # There's a max_attempts=60 for it would take up to 60 seconds
    # for primary mom to wait for the sisters to join
    # (default $sister_join_job_alarm of 30 seconds) and to wait for
    # sisters to execjob_prologue hooks (default $job_launch_delay
    # value of 30 seconds)
    self.server.expect(JOB, {'job_state': 'R',
                             'tolerate_node_failures': 'job_start',
                             'Resource_List.mem': '6gb',
                             'Resource_List.ncpus': 8,
                             'Resource_List.nodect': 3,
                             'Resource_List.select': self.job1v4_select,
                             'Resource_List.place': self.job1_place,
                             'schedselect': self.job1v4_schedselect,
                             'exec_host': self.job1v4_exec_host,
                             'exec_vnode': self.job1v4_exec_vnode},
                       id=jid, interval=1, attrop=PTL_AND, max_attempts=70)
    thisjob = self.server.status(JOB, id=jid)
    # Fail fast if the job cannot be statused; previously
    # job_output_file would have been referenced unbound below,
    # raising a NameError instead of a clean test failure.
    self.assertTrue(thisjob, "failed to status job %s" % (jid,))
    job_output_file = thisjob[0]['Output_Path'].split(':')[1]
    # Check various vnode status.
    jobs_assn1 = "%s/0" % (jid,)
    self.match_vnode_status([self.nAv0, self.nAv1, self.nB, self.nBv0],
                            'job-busy', jobs_assn1, 1, '1048576kb')
    jobs_assn2 = "%s/0, %s/1" % (jid, jid)
    self.match_vnode_status([self.nD], 'free', jobs_assn2,
                            2, '2097152kb')
    self.match_vnode_status([self.nAv2, self.nBv1],
                            'job-busy', jobs_assn1, 1, '0kb')
    self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
                             self.nC, self.nD, self.nEv1, self.nEv2,
                             self.nEv3, self.nE, self.nEv0], 'free')
    # check server/queue counts
    self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                'resources_assigned.mem': '6291456kb'},
                       attrop=PTL_AND)
    self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                               'resources_assigned.mem': '6291456kb'},
                       id='workq', attrop=PTL_AND)
    self.assertTrue(
        self.pbs_nodefile_match_exec_host(jid, self.job1v4_exec_host))
    # Check vnode_list[] parameter in execjob_launch hook
    vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                  self.nB, self.nBv0, self.nBv1,
                  self.nC, self.nD, self.nE, self.nEv0]
    for vn in vnode_list:
        self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
            jid, vn), n=10)
    # Check vnode_list_fail[] parameter in execjob_launch hook:
    # the vnodes of the two silenced moms are reported as failed.
    vnode_list_fail = [self.nC, self.nE, self.nEv0]
    for vn in vnode_list_fail:
        self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
            jid, vn), n=10)
    # Check result of pbs.event().job.release_nodes(keep_select) call
    self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
        jid, self.job1v4_exec_vnode), n=10)
    self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
        jid, self.job1v4_schedselect), n=10)
    self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
        jid, self.job1_iexec_vnode), n=10)
    self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
        jid, self.job1v4_exec_vnode), n=10)
    # Check accounting_logs
    self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                              self.job1_iexec_vnode_esc, "10gb", 13, 5,
                              self.job1_place,
                              self.job1_isel_esc)
    self.match_accounting_log('s', jid, self.job1v4_exec_host_esc,
                              self.job1v4_exec_vnode_esc,
                              "6gb", 8, 3,
                              self.job1_place,
                              self.job1v4_sel_esc)
    self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                        n=10, max_attempts=60, interval=2, regexp=True)
    self.momA.log_match("Job;%s;copy file request received" % (jid,),
                        n=10, max_attempts=10, interval=2)
    # validate output
    expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momB.hostname, self.momD.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momB.shortname, self.momD.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momB.hostname, self.momB.hostname, self.momB.shortname,
       self.momD.hostname, self.momD.hostname, self.momD.shortname)
    job_out = ""
    with open(job_output_file, 'r') as fd:
        job_out = fd.read()
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    self.assertEqual(job_out, expected_out)
|