# coding: utf-8

# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

from tests.functional import *


def convert_time(fmt, tm, fixdate=False):
    """
    Convert the given timestamp <tm> into the given format <fmt>.
    If 'fixdate' is True, add a <space> before the day of the month
    when it is a single digit, to match ctime output as qstat uses it.
    """
    rv = time.strftime(fmt, time.localtime(float(tm)))
    if (sys.platform not in ('cygwin', 'win32')) and fixdate:
        rv = rv.split()
        date = int(rv[2])
        if date <= 9:
            date = ' ' + str(date)
        rv[2] = str(date)
        rv = ' '.join(rv)
    return rv
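
# Illustrative example (not from the original file): with
# fmt='%a %b %d %H:%M:%S %Y' and a timestamp falling on the 9th of the
# month, fixdate=True yields e.g. 'Wed May  9 10:00:00 2018' (note the
# space-padded day), matching ctime()-style output.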


def create_subjob_id(job_array_id, subjob_index):
    """
    Insert the subjob index into the square brackets of the job array id.
    """
    idx = job_array_id.find('[]')
    return job_array_id[:idx + 1] + str(subjob_index) + job_array_id[idx + 1:]
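
# For example, create_subjob_id('123[].server', 4) returns
# '123[4].server'.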


class TestPbsReliableJobStartup(TestFunctional):

    """
    This tests the Reliable Job Startup feature, where a job can be
    started with extra nodes, with node failures tolerated during job
    start (and even throughout the life of the job), before the job is
    pruned back to a set of healthy nodes that satisfy the original
    request.

    Custom parameters:
    moms: colon-separated hostnames of five MoMs
    """

    def pbs_nodefile_match_exec_host(self, jid, exec_host,
                                     schedselect=None):
        """
        Look into the PBS_NODEFILE on the first host listed in
        'exec_host', and return True if all host entries in 'exec_host'
        match the entries in the file; otherwise, return False.
        If 'schedselect' is not None, look for its 'mpiprocs' values
        and verify that the corresponding node hosts appear in
        PBS_NODEFILE 'mpiprocs' number of times.
        """
        pbs_nodefile = os.path.join(
            self.server.pbs_conf['PBS_HOME'], 'aux', jid)
        # look for mpiprocs settings
        mpiprocs = []
        if schedselect is not None:
            for chunk in schedselect.split('+'):
                chl = chunk.split(':')
                for ch in chl:
                    if ch.find('=') != -1:
                        c = ch.split('=')
                        if c[0] == "mpiprocs":
                            mpiprocs.append(c[1])
        ehost = exec_host.split('+')
        first_host = ehost[0].split('/')[0]
        cmd = ['cat', pbs_nodefile]
        ret = self.server.du.run_cmd(first_host, cmd, sudo=False)
        ehost2 = []
        for h in ret['out']:
            ehost2.append(h.split('.')[0])
        ehost1 = []
        j = 0
        for eh in ehost:
            h = eh.split('/')
            if len(mpiprocs) > 0:
                for _ in range(int(mpiprocs[j])):
                    ehost1.append(h[0])
            else:
                ehost1.append(h[0])
            j += 1
        self.logger.info("EHOST1=%s" % (ehost1,))
        self.logger.info("EHOST2=%s" % (ehost2,))
        if ehost1 != ehost2:
            return False
        return True
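
    # Note (illustrative): an exec_host entry like '<host>/0*3'
    # contributes '<host>' once to the expected list unless
    # 'schedselect' carries an mpiprocs value, in which case the host
    # must appear that many times in PBS_NODEFILE; e.g. a chunk of
    # 'ncpus=3:mpiprocs=3' on hostD expects hostD listed 3 times.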

    def match_accounting_log(self, atype, jid, exec_host, exec_vnode,
                             mem, ncpus, nodect, place, select):
        """
        This checks if there's an accounting log record 'atype' for
        job 'jid' containing the values given (i.e.
        Resource_List.exec_host, Resource_List.exec_vnode, etc...).
        This throws an exception upon encountering a non-matching
        accounting_logs entry.
        Some example values of 'atype' are: 'u' (update record due to
        release node request), 'c' (record containing the next
        set of resources to be used by a phased job as a result of
        release node request), 'e' (last update record for a phased job
        due to a release node request), 'E' (end of job record),
        's' (secondary start record).
        """
        if atype == 'e':
            self.mom.log_match("Job;%s;Obit sent" % (jid,), n=100,
                               max_attempts=5, interval=5)
        self.server.accounting_match(
            msg=".*%s;%s.*exec_host=%s" % (atype, jid, exec_host),
            regexp=True, n=20, max_attempts=3)
        self.server.accounting_match(
            msg=".*%s;%s.*exec_vnode=%s" % (atype, jid, exec_vnode),
            regexp=True, n=20, max_attempts=3)
        self.server.accounting_match(
            msg=r".*%s;%s.*Resource_List\.mem=%s" % (atype, jid, mem),
            regexp=True, n=20, max_attempts=3)
        self.server.accounting_match(
            msg=r".*%s;%s.*Resource_List\.ncpus=%d" % (atype, jid, ncpus),
            regexp=True, n=20, max_attempts=3)
        self.server.accounting_match(
            msg=r".*%s;%s.*Resource_List\.nodect=%d" % (atype, jid, nodect),
            regexp=True, n=20, max_attempts=3)
        self.server.accounting_match(
            msg=r".*%s;%s.*Resource_List\.place=%s" % (atype, jid, place),
            regexp=True, n=20, max_attempts=3)
        self.server.accounting_match(
            msg=r".*%s;%s.*Resource_List\.select=%s" % (atype, jid, select),
            regexp=True, n=20, max_attempts=3)
        if (atype != 'c') and (atype != 'S') and (atype != 's'):
            self.server.accounting_match(
                msg=r".*%s;%s.*resources_used\." % (atype, jid),
                regexp=True, n=20, max_attempts=3)
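
    # Typical call shape (values illustrative): match_accounting_log(
    #     'E', jid, self.job1_exec_host_esc, self.job1_exec_vnode_esc,
    #     "6gb", 8, 3, self.job1_place, self.job1_sel_esc)
    # The exec_host/exec_vnode/select arguments are passed in their
    # regex-escaped (*_esc) form since accounting_match() is called
    # with regexp=True.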

    def match_vnode_status(self, vnode_list, state, jobs=None, ncpus=None,
                           mem=None):
        """
        Given a list of vnode names in 'vnode_list', check to make
        sure each vnode's state, jobs string, resources_assigned.mem,
        and resources_assigned.ncpus match the passed arguments.
        This will throw an exception if a match is not found.
        """
        for vn in vnode_list:
            dict_match = {'state': state}
            if jobs is not None:
                dict_match['jobs'] = jobs
            if ncpus is not None:
                dict_match['resources_assigned.ncpus'] = ncpus
            if mem is not None:
                dict_match['resources_assigned.mem'] = mem
            self.server.expect(VNODE, dict_match, id=vn)
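
    # Example (illustrative): after job <jid> starts, vnodes assigned
    # 1 cpu and 1gb could be checked with:
    #     self.match_vnode_status([self.nAv0, self.nAv1], 'job-busy',
    #                             jobs_assigned, 1, '1048576kb')
    # where 'jobs_assigned' is the expected jobs string for the vnode.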

    def create_and_submit_job(self, job_type, attribs=None):
        """
        Create a job object based on 'job_type' and the attributes
        list 'attribs', and submit it to the server.
        """
        if attribs:
            retjob = Job(TEST_USER, attrs=attribs)
        else:
            retjob = Job(TEST_USER)
        # each known job_type maps directly to a script of the same name
        if job_type in self.script:
            retjob.create_script(self.script[job_type])
        return self.server.submit(retjob)
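
    # Example (illustrative): submit the tolerant job defined by
    # self.script['job1'], asking that node failures be tolerated
    # during job start:
    #     a = {ATTR_W: 'tolerate_node_failures=job_start'}
    #     jid = self.create_and_submit_job('job1', a)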

    def setUp(self):

        if len(self.moms) != 5:
            cmt = "need 5 mom hosts: -p moms=<m1>:<m2>:<m3>:<m4>:<m5>"
            self.skip_test(reason=cmt)

        TestFunctional.setUp(self)
        Job.dflt_attributes[ATTR_k] = 'oe'

        self.server.cleanup_jobs(extend="force")

        self.momA = self.moms.values()[0]
        self.momB = self.moms.values()[1]
        self.momC = self.moms.values()[2]
        self.momD = self.moms.values()[3]
        self.momE = self.moms.values()[4]

        # Now start setting up and creating the vnodes
        self.server.manager(MGR_CMD_DELETE, NODE, None, "")

        # set node momA
        self.hostA = self.momA.shortname
        self.momA.delete_vnode_defs()
        vnode_prefix = self.hostA
        a = {'resources_available.mem': '1gb',
             'resources_available.ncpus': '1'}
        vnodedef = self.momA.create_vnode_def(vnode_prefix, a, 4)
        self.assertNotEqual(vnodedef, None)
        self.momA.insert_vnode_def(vnodedef, 'vnode.def')
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA)

        # set node momB
        self.hostB = self.momB.shortname
        self.momB.delete_vnode_defs()
        vnode_prefix = self.hostB
        a = {'resources_available.mem': '1gb',
             'resources_available.ncpus': '1'}
        vnodedef = self.momB.create_vnode_def(vnode_prefix, a, 5,
                                              usenatvnode=True)
        self.assertNotEqual(vnodedef, None)
        self.momB.insert_vnode_def(vnodedef, 'vnode.def')
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB)

        # set node momC
        # This one has no vnode definition.
        self.hostC = self.momC.shortname
        self.momC.delete_vnode_defs()
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostC)
        a = {'resources_available.ncpus': 2,
             'resources_available.mem': '2gb'}
        # set natural vnode of hostC
        self.server.manager(MGR_CMD_SET, NODE, a, id=self.hostC,
                            expect=True)

        # set node momD
        # This one has no vnode definition.
        self.hostD = self.momD.shortname
        self.momD.delete_vnode_defs()
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostD)
        a = {'resources_available.ncpus': 5,
             'resources_available.mem': '5gb'}
        # set natural vnode of hostD
        self.server.manager(MGR_CMD_SET, NODE, a, id=self.hostD,
                            expect=True)

        # set node momE
        self.hostE = self.momE.shortname
        self.momE.delete_vnode_defs()
        vnode_prefix = self.hostE
        a = {'resources_available.mem': '1gb',
             'resources_available.ncpus': '1'}
        vnodedef = self.momE.create_vnode_def(vnode_prefix, a, 5,
                                              usenatvnode=True)
        self.assertNotEqual(vnodedef, None)
        self.momE.insert_vnode_def(vnodedef, 'vnode.def')
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostE)

        # Various node names
        self.nA = self.hostA
        self.nAv0 = '%s[0]' % (self.hostA,)
        self.nAv1 = '%s[1]' % (self.hostA,)
        self.nAv2 = '%s[2]' % (self.hostA,)
        self.nAv3 = '%s[3]' % (self.hostA,)
        self.nB = self.hostB
        self.nBv0 = '%s[0]' % (self.hostB,)
        self.nBv1 = '%s[1]' % (self.hostB,)
        self.nBv2 = '%s[2]' % (self.hostB,)
        self.nBv3 = '%s[3]' % (self.hostB,)
        self.nC = self.hostC
        self.nD = self.hostD
        self.nE = self.hostE
        self.nEv0 = '%s[0]' % (self.hostE,)
        self.nEv1 = '%s[1]' % (self.hostE,)
        self.nEv2 = '%s[2]' % (self.hostE,)
        self.nEv3 = '%s[3]' % (self.hostE,)

        a = {'state': 'free', 'resources_available.ncpus': (GE, 1)}
        self.server.expect(VNODE, {'state=free': 17}, count=True,
                           max_attempts=10, interval=2)

        if sys.platform in ('cygwin', 'win32'):
            SLEEP_CMD = "pbs-sleep"
        else:
            SLEEP_CMD = os.path.join(os.sep, "bin", "sleep")

        self.pbs_release_nodes_cmd = os.path.join(
            self.server.pbs_conf['PBS_EXEC'], 'bin', 'pbs_release_nodes')

        FIB37 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin',
                             'pbs_python') + \
            ' -c "exec(\\\"def fib(i):\\n if i < 2:\\n' + \
            '  return i\\n return fib(i-1) + fib(i-2)\\n\\n' + \
            'print fib(37)\\\")"'
        self.fib37_value = 24157817

        FIB40 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin',
                             'pbs_python') + \
            ' -c "exec(\\\"def fib(i):\\n if i < 2:\\n' + \
            '  return i\\n return fib(i-1) + fib(i-2)\\n\\n' + \
            'print fib(40)\\\")"'
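
        # FIB37 and FIB40 are pbs_python command lines computing
        # fib(37) and fib(40) with a naive recursive fib(); fib(40)
        # is simply the longer-running workload of the two.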

        # job submission arguments
        self.script = {}

        # original select spec
        self.job1_oselect = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=2:mem=2gb"
        self.job1_place = "scatter"

        # incremented values at job start and just before actual launch
        self.job1_iselect = \
            "1:ncpus=3:mem=2gb+2:ncpus=3:mem=2gb+2:ncpus=2:mem=2gb"
        self.job1_ischedselect = self.job1_iselect
        self.job1_iexec_host = "%s/0*0+%s/0*0+%s/0*3+%s/0*2+%s/0*0" % (
            self.nA, self.nB, self.nD, self.nC, self.nE)
        self.job1_iexec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
            "%s:ncpus=1)+" % (self.nBv1,) + \
            "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
            "(%s:ncpus=2:mem=2097152kb)+" % (self.nC,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
            "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
        self.job1_isel_esc = self.job1_iselect.replace("+", r"\+")
        self.job1_iexec_host_esc = self.job1_iexec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job1_iexec_vnode_esc = self.job1_iexec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        # expected values version 1 upon successful job launch
        self.job1_select = \
            "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
        self.job1_schedselect = self.job1_select
        self.job1_exec_host = "%s/0*0+%s/0*3+%s/0*0" % (
            self.nA, self.nD, self.nE)
        self.job1_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
            "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
        self.job1_sel_esc = self.job1_select.replace("+", r"\+")
        self.job1_exec_host_esc = self.job1_exec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job1_exec_vnode_esc = self.job1_exec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        # expected values version 2 upon successful job launch
        self.job1v2_select = \
            "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
        self.job1v2_schedselect = self.job1v2_select
        self.job1v2_exec_host = "%s/0*0+%s/0*3+%s/0*2" % (
            self.nA, self.nD, self.nC)
        self.job1v2_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
            "(%s:ncpus=2:mem=2097152kb)" % (self.nC,)
        self.job1v2_sel_esc = self.job1v2_select.replace("+", r"\+")
        self.job1v2_exec_host_esc = self.job1v2_exec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job1v2_exec_vnode_esc = self.job1v2_exec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        # expected values version 3 upon successful job launch
        self.job1v3_select = \
            "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
        self.job1v3_schedselect = self.job1v3_select
        self.job1v3_exec_host = "%s/0*0+%s/0*0+%s/0*0" % (
            self.nA, self.nB, self.nE)
        self.job1v3_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
            "%s:ncpus=1)+" % (self.nBv1,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
            "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
        self.job1v3_sel_esc = self.job1v3_select.replace("+", r"\+")
        self.job1v3_exec_host_esc = self.job1v3_exec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job1v3_exec_vnode_esc = self.job1v3_exec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        # expected values version 4 upon successful job launch
        self.job1v4_select = \
            "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
        self.job1v4_schedselect = self.job1v4_select
        self.job1v4_exec_host = "%s/0*0+%s/0*0+%s/0*2" % (
            self.nA, self.nB, self.nD)
        self.job1v4_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
            "%s:ncpus=1)+" % (self.nBv1,) + \
            "(%s:ncpus=2:mem=2097152kb)" % (self.nD,)
        self.job1v4_sel_esc = self.job1v4_select.replace("+", r"\+")
        self.job1v4_exec_host_esc = self.job1v4_exec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job1v4_exec_vnode_esc = self.job1v4_exec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        # expected values version 5 upon successful job launch
        self.job1v5_select = \
            "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
        self.job1v5_schedselect = self.job1v5_select
        self.job1v5_exec_host = "%s/0*0+%s/0*0+%s/0*2" % (
            self.nA, self.nB, self.nC)
        self.job1v5_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
            "%s:ncpus=1)+" % (self.nBv1,) + \
            "(%s:ncpus=2:mem=2097152kb)" % (self.nC,)
        self.job1v5_sel_esc = self.job1v5_select.replace("+", r"\+")
        self.job1v5_exec_host_esc = self.job1v5_exec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job1v5_exec_vnode_esc = self.job1v5_exec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        # expected values version 6 upon successful job launch
        self.job1v6_select = \
            "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
        self.job1v6_select += "+1:ncpus=1:mem=1gb"
        self.job1v6_schedselect = self.job1v6_select
        self.job1v6_exec_host = "%s/0*0+%s/0*0+%s/0*2+%s/0" % (
            self.nA, self.nB, self.nC, self.nE)
        self.job1v6_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
            "%s:ncpus=1)+" % (self.nBv1,) + \
            "(%s:ncpus=2:mem=2097152kb)+" % (self.nC,) + \
            "(%s:mem=1048576kb:ncpus=1)" % (self.nE,)
        self.job1v6_sel_esc = self.job1v6_select.replace("+", r"\+")
        self.job1v6_exec_host_esc = self.job1v6_exec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job1v6_exec_vnode_esc = self.job1v6_exec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        self.script['job1'] = """
#PBS -l select=%s
#PBS -l place=%s
#PBS -W umask=022
#PBS -S /bin/bash
echo "$PBS_NODEFILE"
cat $PBS_NODEFILE
echo 'FIB TESTS'
echo 'pbsdsh -n 1 fib 37'
pbsdsh -n 1 -- %s
echo 'pbsdsh -n 2 fib 37'
pbsdsh -n 2 -- %s
echo 'fib 37'
%s
echo 'HOSTNAME TESTS'
echo 'pbsdsh -n 0 hostname'
pbsdsh -n 0 -- hostname -s
echo 'pbsdsh -n 1 hostname'
pbsdsh -n 1 -- hostname -s
echo 'pbsdsh -n 2 hostname'
pbsdsh -n 2 -- hostname -s
echo 'PBS_NODEFILE tests'
for h in `cat $PBS_NODEFILE`
do
    echo "HOST=$h"
    echo "pbs_tmrsh $h hostname"
    pbs_tmrsh $h hostname -s
done
""" % (self.job1_oselect, self.job1_place, FIB37, FIB37, FIB37)

        # original select spec
        self.jobA_oselect = "ncpus=1:mem=1gb+ncpus=1:mem=1gb+ncpus=1:mem=1gb"
        self.jobA_place = "scatter"

        # incremented values at job start and just before actual launch
        self.jobA_iselect = \
            "1:ncpus=1:mem=1gb+2:ncpus=1:mem=1gb+2:ncpus=1:mem=1gb"
        self.jobA_ischedselect = self.jobA_iselect
        self.jobA_iexec_host1 = "%s/0+%s/0+%s/0+%s/0+%s/0" % (
            self.nA, self.nB, self.nC, self.nD, self.nE)
        self.jobA_iexec_host2 = "%s/1+%s/1+%s/1+%s/1+%s/1" % (
            self.nA, self.nB, self.nC, self.nD, self.nE)
        self.jobA_iexec_host3 = "%s/2+%s/2+%s/0+%s/2+%s/0" % (
            self.nA, self.nB, self.nC, self.nD, self.nE)
        self.jobA_iexec_vnode1 = \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv0,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nB,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nC,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nD,) + \
            "(%s:ncpus=1:mem=1048576kb)" % (self.nE,)
        self.jobA_iexec_vnode2 = \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv1,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nBv0,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nC,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nD,) + \
            "(%s:ncpus=1:mem=1048576kb)" % (self.nEv0,)
        self.jobA_iexec_vnode3 = \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv2,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nBv1,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nC,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nD,) + \
            "(%s:ncpus=1:mem=1048576kb)" % (self.nE,)
        self.jobA_isel_esc = self.jobA_iselect.replace("+", r"\+")
        self.jobA_iexec_host1_esc = self.jobA_iexec_host1.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.jobA_iexec_host2_esc = self.jobA_iexec_host2.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.jobA_iexec_host3_esc = self.jobA_iexec_host3.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.jobA_iexec_vnode1_esc = self.jobA_iexec_vnode1.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")
        self.jobA_iexec_vnode2_esc = self.jobA_iexec_vnode2.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")
        self.jobA_iexec_vnode3_esc = self.jobA_iexec_vnode3.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        # expected values version 1 upon successful job launch
        self.jobA_select = \
            "1:ncpus=1:mem=1gb+1:ncpus=1:mem=1gb+1:ncpus=1:mem=1gb"
        self.jobA_schedselect = self.jobA_select
        self.jobA_exec_host1 = "%s/0+%s/0+%s/0" % (
            self.nA, self.nB, self.nD)
        self.jobA_exec_host2 = "%s/1+%s/1+%s/1" % (
            self.nA, self.nB, self.nD)
        self.jobA_exec_host3 = "%s/2+%s/2+%s/2" % (
            self.nA, self.nB, self.nD)
        self.jobA_exec_vnode1 = \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv0,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nB,) + \
            "(%s:ncpus=1:mem=1048576kb)" % (self.nD,)
        self.jobA_exec_vnode2 = \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv1,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nBv0,) + \
            "(%s:ncpus=1:mem=1048576kb)" % (self.nD,)
        self.jobA_exec_vnode3 = \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nAv2,) + \
            "(%s:ncpus=1:mem=1048576kb)+" % (self.nBv1,) + \
            "(%s:ncpus=1:mem=1048576kb)" % (self.nD,)
        self.jobA_sel_esc = self.jobA_select.replace("+", r"\+")
        self.jobA_exec_host1_esc = self.jobA_exec_host1.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.jobA_exec_host2_esc = self.jobA_exec_host2.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.jobA_exec_host3_esc = self.jobA_exec_host3.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.jobA_exec_vnode1_esc = self.jobA_exec_vnode1.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")
        self.jobA_exec_vnode2_esc = self.jobA_exec_vnode2.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")
        self.jobA_exec_vnode3_esc = self.jobA_exec_vnode3.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        self.script['jobA'] = """
#PBS -J 1-3
#PBS -l select=%s
#PBS -l place=%s
#PBS -S /bin/bash
echo 'HOSTNAME TESTS'
echo 'pbsdsh -n 0 hostname'
pbsdsh -n 0 -- hostname -s
echo 'pbsdsh -n 1 hostname'
pbsdsh -n 1 -- hostname -s
echo 'pbsdsh -n 2 hostname'
pbsdsh -n 2 -- hostname -s
sleep 180
""" % (self.jobA_oselect, self.jobA_place)

        self.script['job1_2'] = """
#PBS -l select=%s
#PBS -l place=%s
#PBS -W umask=022
#PBS -S /bin/bash
echo "$PBS_NODEFILE"
cat $PBS_NODEFILE
echo 'FIB TESTS'
echo 'pbsdsh -n 2 fib 37'
pbsdsh -n 2 -- %s
echo 'fib 37'
%s
echo 'HOSTNAME TESTS'
echo 'pbsdsh -n 0 hostname'
pbsdsh -n 0 -- hostname -s
echo 'pbsdsh -n 2 hostname'
pbsdsh -n 2 -- hostname -s
""" % (self.job1_oselect, self.job1_place, FIB37, FIB37)

        self.script['job1_3'] = """
#PBS -l select=%s
#PBS -l place=%s
#PBS -W umask=022
#PBS -S /bin/bash
echo "$PBS_NODEFILE"
cat $PBS_NODEFILE
echo 'FIB TESTS'
echo 'pbsdsh -n 2 fib 40'
pbsdsh -n 2 -- %s
echo 'fib 40'
%s
echo 'HOSTNAME TESTS'
echo 'pbsdsh -n 0 hostname'
pbsdsh -n 0 -- hostname -s
echo 'pbsdsh -n 2 hostname'
pbsdsh -n 2 -- hostname -s
""" % (self.job1_oselect, self.job1_place, FIB40, FIB40)

        self.script['job1_4'] = """
#PBS -l select=%s
#PBS -l place=%s
#PBS -W umask=022
#PBS -S /bin/bash
echo "$PBS_NODEFILE"
cat $PBS_NODEFILE
echo 'FIB TESTS'
echo 'pbsdsh -n 1 fib 37'
pbsdsh -n 1 -- %s
echo 'pbsdsh -n 2 fib 37'
pbsdsh -n 2 -- %s
echo 'pbsdsh -n 3 fib 37'
pbsdsh -n 3 -- %s
echo 'fib 37'
%s
echo 'HOSTNAME TESTS'
echo 'pbsdsh -n 0 hostname'
pbsdsh -n 0 -- hostname -s
echo 'pbsdsh -n 1 hostname'
pbsdsh -n 1 -- hostname -s
echo 'pbsdsh -n 2 hostname'
pbsdsh -n 2 -- hostname -s
echo 'pbsdsh -n 3 hostname'
pbsdsh -n 3 -- hostname -s
echo 'PBS_NODEFILE tests'
for h in `cat $PBS_NODEFILE`
do
    echo "HOST=$h"
    echo "pbs_tmrsh $h hostname"
    pbs_tmrsh $h hostname -s
done
""" % (self.job1_oselect, self.job1_place, FIB37, FIB37, FIB37, FIB37)

        # original select spec
        self.job2_oselect = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=0:mem=2gb"
        self.job2_place = "scatter"

        # incremented values at job start and just before actual launch
        self.job2_iselect = \
            "1:ncpus=3:mem=2gb+2:ncpus=3:mem=2gb+2:ncpus=0:mem=2gb"
        self.job2_ischedselect = self.job2_iselect
        self.job2_iexec_host = "%s/0*0+%s/0*0+%s/0*3+%s/0*0+%s/0*0" % (
            self.nA, self.nB, self.nD, self.nC, self.nE)
        self.job2_iexec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
            "%s:ncpus=1)+" % (self.nBv1,) + \
            "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
            "(%s:ncpus=0:mem=2097152kb)+" % (self.nC,) + \
            "(%s:mem=1048576kb:ncpus=0+" % (self.nE,) + \
            "%s:mem=1048576kb)" % (self.nEv0,)
        self.job2_isel_esc = self.job2_iselect.replace("+", r"\+")
        self.job2_iexec_host_esc = self.job2_iexec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job2_iexec_vnode_esc = self.job2_iexec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        # expected values upon successful job launch
        self.job2_select = \
            "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=0:mem=2gb"
        self.job2_schedselect = self.job2_select
        self.job2_exec_host = "%s/0*0+%s/0*3+%s/0*0" % (
            self.nA, self.nD, self.nE)
        # ncpus=0 assigned hosts are not listed in $PBS_NODEFILE
        self.job2_exec_host_nfile = "%s/0*0+%s/0*3" % (
            self.nA, self.nD)
        self.job2_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
            "(%s:mem=1048576kb+" % (self.nE,) + \
            "%s:mem=1048576kb)" % (self.nEv0,)
        self.job2_sel_esc = self.job2_select.replace("+", r"\+")
        self.job2_exec_host_esc = self.job2_exec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job2_exec_vnode_esc = self.job2_exec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        self.script['job2'] = \
            "#PBS -l select=" + self.job2_oselect + "\n" + \
            "#PBS -l place=" + self.job2_place + "\n" + \
            SLEEP_CMD + " 60\n"

        # Job with mpiprocs and ompthreads requested
        self.job3_oselect = \
            "ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
            "ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
            "ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
        self.job3_place = "scatter"

        # incremented values at job start and just before actual launch
        self.job3_iselect = \
            "1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
            "2:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
            "2:ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
        self.job3_ischedselect = self.job3_iselect
        self.job3_iexec_host = \
            "%s/0*0+%s/0*0+%s/0*3+%s/0*2+%s/0*0" % (
                self.nA, self.nB, self.nD, self.nC, self.nE)
        self.job3_iexec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
            "%s:ncpus=1)+" % (self.nBv1,) + \
            "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
            "(%s:ncpus=2:mem=2097152kb)+" % (self.nC,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
            "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)

        # expected values upon successful job launch
        self.job3_select = \
            "1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
            "1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
            "1:ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
        self.job3_schedselect = self.job3_select
        self.job3_exec_host = "%s/0*0+%s/0*3+%s/0*0" % (
            self.nA, self.nD, self.nE)
        self.job3_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
            "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)
        self.job3_sel_esc = self.job3_select.replace("+", r"\+")
        self.job3_exec_host_esc = self.job3_exec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job3_exec_vnode_esc = self.job3_exec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")
        self.job3_isel_esc = self.job3_iselect.replace("+", r"\+")
        self.job3_iexec_host_esc = self.job3_iexec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job3_iexec_vnode_esc = self.job3_iexec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")

        self.script['job3'] = \
            "#PBS -l select=" + self.job3_oselect + "\n" + \
            "#PBS -l place=" + self.job3_place + "\n" + \
            SLEEP_CMD + " 300\n"

        self.job4_oselect = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=2:mem=2gb"
        self.job4_place = "scatter:excl"

        # incremented values at job start and just before actual launch
        self.job4_iselect = \
            "1:ncpus=3:mem=2gb+2:ncpus=3:mem=2gb+2:ncpus=2:mem=2gb"
        self.job4_ischedselect = self.job4_iselect
        self.job4_iexec_host = \
            "%s/0*0+%s/0*0+%s/0*3+%s/0*2+%s/0*0" % (
                self.nA, self.nB, self.nD, self.nC, self.nE)
        self.job4_iexec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
            "%s:ncpus=1)+" % (self.nBv1,) + \
            "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
            "(%s:ncpus=2:mem=2097152kb)+" % (self.nC,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
            "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)

        # expected values upon successful job launch
        self.job4_select = \
            "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
        self.job4_schedselect = "1:ncpus=3:mem=2gb+" + \
            "1:ncpus=3:mem=2gb+1:ncpus=2:mem=2gb"
        self.job4_exec_host = "%s/0*0+%s/0*3+%s/0*0" % (
            self.nA, self.nD, self.nE)
        self.job4_exec_vnode = \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
            "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
            "%s:ncpus=1)+" % (self.nAv2,) + \
            "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
            "(%s:mem=1048576kb:ncpus=1+" % (self.nE,) + \
            "%s:mem=1048576kb:ncpus=1)" % (self.nEv0,)

        self.script['job4'] = \
            "#PBS -l select=" + self.job4_oselect + "\n" + \
            "#PBS -l place=" + self.job4_place + "\n" + \
            SLEEP_CMD + " 300\n"

        self.job4_sel_esc = self.job4_select.replace("+", r"\+")
        self.job4_exec_host_esc = self.job4_exec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job4_exec_vnode_esc = self.job4_exec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")
        self.job4_isel_esc = self.job4_iselect.replace("+", r"\+")
        self.job4_iexec_host_esc = self.job4_iexec_host.replace(
            "*", r"\*").replace("[", r"\[").replace(
            "]", r"\]").replace("+", r"\+")
        self.job4_iexec_vnode_esc = self.job4_iexec_vnode.replace(
            "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace(
            ")", r"\)").replace("+", r"\+")
  825. self.job5_oselect = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=2:mem=2gb"
  826. self.job5_place = "free"
  827. self.job5_iselect = \
  828. "1:ncpus=3:mem=2gb+2:ncpus=3:mem=2gb+2:ncpus=2:mem=2gb"
  829. self.job5_ischedselect = self.job5_iselect
  830. self.job5_iexec_host = \
  831. "%s/0*0+%s/0*0+%s/0*3+%s/1*0+%s/0*2" % (
  832. self.nA, self.nB, self.nD, self.nB, self.nC)
  833. self.job5_iexec_vnode = \
  834. "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
  835. "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
  836. "%s:ncpus=1)+" % (self.nAv2) + \
  837. "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
  838. "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
  839. "%s:ncpus=1)+" % (self.nBv1,) + \
  840. "(%s:ncpus=3:mem=2097152kb)+" % (self.nD,) + \
  841. "(%s:mem=1048576kb+" % (self.nBv1,) + \
  842. "%s:mem=1048576kb:ncpus=1+" % (self.nBv2,) + \
  843. "%s:ncpus=1)+" % (self.nBv3,) + \
  844. "(%s:ncpus=2:mem=2097152kb)" % (self.nC,)
  845. # expected values upon successful job launch
  846. self.job5_select = \
  847. "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+1:ncpus=1:mem=1gb"
  848. self.job5_schedselect = self.job5_select
  849. self.job5_exec_host = "%s/0*0+%s/0*0+%s/1*0" % (
  850. self.nA, self.nB, self.nB)
  851. self.job5_exec_vnode = \
  852. "(%s:mem=1048576kb:ncpus=1+" % (self.nAv0,) + \
  853. "%s:mem=1048576kb:ncpus=1+" % (self.nAv1,) + \
  854. "%s:ncpus=1)+" % (self.nAv2) + \
  855. "(%s:mem=1048576kb:ncpus=1+" % (self.nB,) + \
  856. "%s:mem=1048576kb:ncpus=1+" % (self.nBv0,) + \
  857. "%s:ncpus=1)+" % (self.nBv1,) + \
  858. "(%s:mem=1048576kb+" % (self.nBv1,) + \
  859. "%s:ncpus=1)" % (self.nBv2,)
  860. self.script['job5'] = \
  861. "#PBS -l select=" + self.job5_oselect + "\n" + \
  862. "#PBS -l place=" + self.job5_place + "\n" + \
  863. SLEEP_CMD + " 300\n"
  864. self.job5_sel_esc = self.job5_select.replace("+", "\+")
  865. self.job5_exec_host_esc = self.job5_exec_host.replace(
  866. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  867. self.job5_exec_vnode_esc = self.job5_exec_vnode.replace(
  868. "[", "\[").replace("]", "\]").replace("(", "\(").replace(
  869. ")", "\)").replace("+", "\+")
  870. self.job5_isel_esc = self.job5_iselect.replace("+", "\+")
  871. self.job5_iexec_host_esc = self.job5_iexec_host.replace(
  872. "*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
  873. self.job5_iexec_vnode_esc = self.job5_iexec_vnode.replace(
  874. "[", "\[").replace("]", "\]").replace("(", "\(").replace(
  875. ")", "\)").replace("+", "\+")
        # queuejob hooks used throughout the test
        self.qjob_hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
# Save current select spec in resource 'site'
e.job.Resource_List["site"] = str(e.job.Resource_List["select"])
new_select = e.job.Resource_List["select"].increment_chunks(1)
e.job.Resource_List["select"] = new_select
e.job.tolerate_node_failures = "job_start"
"""
        self.qjob_hook_body2 = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
# Save current select spec in resource 'site'
e.job.Resource_List["site"] = str(e.job.Resource_List["select"])
new_select = e.job.Resource_List["select"].increment_chunks(1)
e.job.Resource_List["select"] = new_select
e.job.tolerate_node_failures = "all"
"""
        # begin hooks used throughout the test
        self.begin_hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
localnode=pbs.get_local_nodename()
if not e.job.in_ms_mom() and (localnode == '%s'):
    e.reject("bad node")
""" % (self.nB,)
        # The below hook may not really be doing anything, but is
        # used in a test of the sister join job alarm time with
        # the hook's alarm value.
        self.begin_hook_body2 = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
localnode=pbs.get_local_nodename()
"""
        # begin_hook_body3 deliberately references an undefined name
        # ('x') so the hook raises an error when run on node E.
        self.begin_hook_body3 = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
localnode=pbs.get_local_nodename()
if not e.job.in_ms_mom() and (localnode == '%s'):
    x
""" % (self.nE,)
        self.begin_hook_body4 = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
localnode=pbs.get_local_nodename()
if not e.job.in_ms_mom() and (localnode == '%s'):
    e.reject("bad node")
""" % (self.nD,)
        self.begin_hook_body5 = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
localnode=pbs.get_local_nodename()
if not e.job.in_ms_mom() and (localnode == '%s'):
    e.reject("bad node")
""" % (self.nC,)
        # prologue hooks used throughout the test
        self.prolo_hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo")
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "prolo: found vnode_list[" + v.name + "]")
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "prolo: found vnode_list_fail[" + v.name + "]")
localnode=pbs.get_local_nodename()
if not e.job.in_ms_mom() and (localnode == '%s'):
    e.reject("bad node")
""" % (self.nC,)
        # prolo_hook_body2 deliberately references an undefined name
        # ('x') so the hook raises an error when run on node C.
        self.prolo_hook_body2 = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing prologue")
localnode=pbs.get_local_nodename()
if not e.job.in_ms_mom() and (localnode == '%s'):
    x
""" % (self.nC,)
        self.prolo_hook_body3 = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo")
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "prolo: found vnode_list[" + v.name + "]")
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "prolo: found vnode_list_fail[" + v.name + "]")
localnode=pbs.get_local_nodename()
"""
        self.prolo_hook_body4 = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo")
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "prolo: found vnode_list[" + v.name + "]")
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "prolo: found vnode_list_fail[" + v.name + "]")
localnode=pbs.get_local_nodename()
if e.job.in_ms_mom():
    pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
    if pj != None:
        pbs.logjobmsg(e.job.id, "prolo: job.exec_vnode=%s" % (pj.exec_vnode,))
        pbs.logjobmsg(e.job.id, "prolo: job.exec_host=%s" % (pj.exec_host,))
        pbs.logjobmsg(e.job.id,
                      "prolo: job.schedselect=%s" % (pj.schedselect,))
    else:
        e.job.Hold_Types = pbs.hold_types("s")
        e.job.rerun()
        e.reject("unsuccessful at PROLOGUE")
"""
        self.prolo_hook_body5 = """
import pbs
import time
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo")
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "prolo: found vnode_list[" + v.name + "]")
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "prolo: found vnode_list_fail[" + v.name + "]")
if not e.job.in_ms_mom():
    pbs.logjobmsg(e.job.id, "sleeping for 30 secs")
    time.sleep(30)
"""
        # launch hooks used throughout the test
        self.launch_hook_body = """
import pbs
e=pbs.event()
if 'PBS_NODEFILE' not in e.env:
    e.accept()
pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
if e.job.in_ms_mom():
    pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
    if pj != None:
        pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
        pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
        pbs.logjobmsg(e.job.id,
                      "launch: job.schedselect=%s" % (pj.schedselect,))
    else:
        e.job.Hold_Types = pbs.hold_types("s")
        e.job.rerun()
        e.reject("unsuccessful at LAUNCH")
"""
        self.launch_hook_body2 = """
import pbs
e=pbs.event()
if 'PBS_NODEFILE' not in e.env:
    e.accept()
pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
if e.job.in_ms_mom():
    new_sel = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=1:mem=1gb"
    pj = e.job.release_nodes(keep_select=new_sel)
    if pj != None:
        pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
        pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
        pbs.logjobmsg(e.job.id,
                      "launch: job.schedselect=%s" % (pj.schedselect,))
    else:
        e.job.Hold_Types = pbs.hold_types("s")
        e.job.rerun()
        e.reject("unsuccessful at LAUNCH")
"""

    def tearDown(self):
        self.momA.signal("-CONT")
        self.momB.signal("-CONT")
        self.momC.signal("-CONT")
        self.momD.signal("-CONT")
        self.momE.signal("-CONT")
        self.momA.unset_mom_config('$sister_join_job_alarm', False)
        self.momA.unset_mom_config('$job_launch_delay', False)
        a = {'state': (DECR, 'offline')}
        self.server.manager(MGR_CMD_SET, NODE, a, self.momA.shortname)
        self.server.manager(MGR_CMD_SET, NODE, a, self.momB.shortname)
        self.server.manager(MGR_CMD_SET, NODE, a, self.momC.shortname)
        self.server.manager(MGR_CMD_SET, NODE, a, self.momD.shortname)
        self.server.manager(MGR_CMD_SET, NODE, a, self.momE.shortname)
        TestFunctional.tearDown(self)
        # Delete managers and operators if added
        attrib = ['operators', 'managers']
        self.server.manager(MGR_CMD_UNSET, SERVER, attrib, expect=True)

    @timeout(400)
    def test_t1(self):
        """
        Test tolerating two node failures at job start after adding
        extra nodes to the job, pruning the job's assigned resources
        to match the original select spec, and offlining the failed
        vnodes.
        1. Have a job submitted with a select spec of 2 super-chunks,
           say (A) and (B), and 1 chunk of (C), along with a place
           spec of "scatter", resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=job_start
        2. Have a queuejob hook that adds 1 extra node to each chunk
           (except the MS (first) chunk), resulting in the assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B's specs while E mirrors
           chunk C.
        3. Have an execjob_begin hook that fails (causes rejection)
           when executed by the mom managing vnodes in (B).
        4. Have an execjob_prologue hook that fails (causes rejection)
           when executed by the mom managing vnodes in (C).
        5. Then create an execjob_launch hook that offlines the failed
           vnodes in (B) and (C), and prunes the job's exec_vnode
           assignment back to satisfying the original 3-node select
           spec, choosing only healthy nodes.
        6. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(D)+(E)
              since (B) and (C) contain vnodes from failed moms.
           b. vnodes in (B) and (C) now show a state of "offline".
           c. The accounting log start record 'S' reflects the select
              request where additional chunks were added, while the
              secondary start record 's' reflects the assigned
              resources after pruning the original select request via
              the pbs.release_nodes(keep_select=...) call inside the
              execjob_launch hook.
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook; it also offlines the vnodes
        # reported in vnode_list_fail[]
        hook_body = """
import pbs
e=pbs.event()
if 'PBS_NODEFILE' not in e.env:
    e.accept()
pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "launch:offline vnode_list_fail[" + v.name + "]")
    v.state = pbs.ND_OFFLINE
if e.job.in_ms_mom():
    pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
    if pj != None:
        pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
        pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
        pbs.logjobmsg(e.job.id,
                      "launch: job.schedselect=%s" % (pj.schedselect,))
    else:
        e.job.Hold_Types = pbs.hold_types("s")
        e.job.rerun()
        e.reject("unsuccessful at LAUNCH")
"""
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches, reflecting the values pruned back to
        # the original select spec.  max_attempts=60 since it can take
        # up to 60 seconds: the primary mom waits for the sisters to
        # join (default $sister_join_job_alarm of 30 seconds) and then
        # waits for the sisters' execjob_prologue hooks (default
        # $job_launch_delay of 30 seconds).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=60)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Check various vnode states.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status(
            [self.nAv0, self.nAv1, self.nE, self.nEv0],
            'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        self.match_vnode_status([self.nB, self.nBv0, self.nBv1, self.nC],
                                'offline')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check the vnode_list[] parameter in the execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check the vnode_list_fail[] parameter in the execjob_prologue hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check the vnode_list[] parameter in the execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check the vnode_list_fail[] parameter in the execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match(
                "Job;%s;launch:offline vnode_list_fail[%s]" % (jid, vn), n=10)
        # Check the result of the pbs.event().job.release_nodes(keep_select)
        # call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # validate output
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momD.hostname, self.momE.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momD.shortname, self.momE.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momD.hostname, self.momD.hostname, self.momD.shortname,
       self.momE.hostname, self.momE.hostname, self.momE.shortname)
        self.logger.info("expected out=%s" % (expected_out,))
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.logger.info("job_out=%s" % (job_out,))
        self.assertEqual(job_out, expected_out)

    @timeout(400)
    def test_t2(self):
        """
        Test tolerating two node failures at job start after adding
        extra nodes to the job, pruning the job's assigned resources
        to match the original select spec, without offlining the
        failed vnodes, and specifying the mom config file options
        '$sister_join_job_alarm' and '$job_launch_delay'.
        1. Set the $sister_join_job_alarm and $job_launch_delay values
           in mom's config file.
        2. Submit a job with a select spec of 2 super-chunks, say (A)
           and (B), and 1 chunk of (C), along with a place spec of
           "scatter", resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=job_start
        3. Have a queuejob hook that adds 1 extra node to each chunk
           (except the MS (first) chunk), resulting in the assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B's specs while E mirrors
           chunk C.
        4. Prior to submitting the job, suspend mom B. When the job
           runs, momB won't be able to join the job, so it won't be
           considered a "healthy" mom.
        5. Have an execjob_begin hook that doesn't fail.
        6. Have an execjob_prologue hook that fails (causes rejection)
           when executed by the mom managing vnodes in (C).
        7. Have an execjob_launch hook that prunes the job's
           exec_vnode assignment back to satisfying the original
           3-node select spec, choosing only healthy nodes.
        8. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(D)+(E)
              since (B) and (C) contain vnodes from failed moms.
           b. vnodes in (B) and (C) now show a state of "free".
           c. Mom's log file shows the explicit values given to
              $sister_join_job_alarm and $job_launch_delay.
           d. The accounting log start record 'S' reflects the select
              request where additional chunks were added, while the
              secondary start record 's' reflects the assigned
              resources after pruning the original select request via
              the pbs.release_nodes(keep_select=...) call inside the
              execjob_launch hook.
        """
        # set mom config options:
        sis_join_alarm = 45
        c = {'$sister_join_job_alarm': sis_join_alarm}
        self.momA.add_config(c)
        job_launch_delay = 40
        c = {'$job_launch_delay': job_launch_delay}
        self.momA.add_config(c)
        self.momA.signal("-HUP")
        self.momA.log_match(
            "sister_join_job_alarm;%d" % (sis_join_alarm,), max_attempts=5,
            interval=5)
        self.momA.log_match(
            "job_launch_delay;%d" % (job_launch_delay,),
            max_attempts=5, interval=5)
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body2)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # temporarily suspend momB, simulating a failed mom host.
        self.momB.signal("-STOP")
        jid = self.create_and_submit_job('job1')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        # Set time to start scanning logs
        stime = int(time.time())
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches, reflecting the values pruned back to
        # the original select spec.  max_attempts=100 since the primary
        # mom waits for the sisters to join (the configured
        # $sister_join_job_alarm of 45 seconds) and then waits for the
        # sisters' execjob_prologue hooks (the configured
        # $job_launch_delay of 40 seconds).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=100)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Verify the logs and make sure sister_join_job_alarm is honored
        logs = self.mom.log_match(
            "Executing begin",
            allmatch=True, starttime=stime, max_attempts=8)
        log1 = logs[0][1]
        logs = self.mom.log_match(
            "Executing prolo",
            allmatch=True, starttime=stime, max_attempts=8)
        log2 = logs[0][1]
        pattern = '%m/%d/%Y %H:%M:%S'
        # Convert the log timestamps into epoch time
        tmp = log1.split(';')
        time1 = int(time.mktime(time.strptime(tmp[0], pattern)))
        tmp = log2.split(';')
        time2 = int(time.mktime(time.strptime(tmp[0], pattern)))
        diff = time2 - time1
        self.logger.info(
            "Time diff between begin hook and prologue hook is " +
            str(diff) + " seconds")
        # Leave a little wiggle room for slow systems
        self.assertTrue((diff >= sis_join_alarm) and
                        (diff <= sis_join_alarm + 5))
        self.mom.log_match(
            "sister_join_job_alarm wait time %d secs exceeded" % (
                sis_join_alarm,), starttime=stime, max_attempts=8)
        # Verify the logs and make sure job_launch_delay is honored
        logs = self.mom.log_match(
            "Executing prolo",
            allmatch=True, starttime=stime, max_attempts=8)
        log1 = logs[0][1]
        logs = self.mom.log_match(
            "Executing launch",
            allmatch=True, starttime=stime, max_attempts=8)
        log2 = logs[0][1]
        tmp = log1.split(';')
        time1 = int(time.mktime(time.strptime(tmp[0], pattern)))
        tmp = log2.split(';')
        time2 = int(time.mktime(time.strptime(tmp[0], pattern)))
        diff = time2 - time1
        self.logger.info("Time diff between prolo hook and launch hook is " +
                         str(diff) + " seconds")
        # Leave a little wiggle room for slow systems
        self.assertTrue((diff >= job_launch_delay) and
                        (diff <= job_launch_delay + 3))
        self.momA.log_match(
            "not all prologue hooks to sister moms completed, " +
            "but job will proceed to execute", n=10)
        # Check various vnode states.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status(
            [self.nAv0, self.nAv1, self.nE, self.nEv0],
            'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check the vnode_list[] parameter in the execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # check server/queue counts
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        # Check the vnode_list[] parameter in the execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check the vnode_list_fail[] parameter in the execjob_launch hook
        vnode_list_fail = [self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check the result of the pbs.event().job.release_nodes(keep_select)
        # call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # validate output
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momD.hostname, self.momE.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momD.shortname, self.momE.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momD.hostname, self.momD.hostname, self.momD.shortname,
       self.momE.hostname, self.momE.hostname, self.momE.shortname)
        self.logger.info("expected out=%s" % (expected_out,))
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.logger.info("job_out=%s" % (job_out,))
        self.assertEqual(job_out, expected_out)

    @timeout(400)
    def test_t3(self):
        """
        Test: tolerating two node failures at job start after adding
        extra nodes to the job, pruning the job's assigned resources
        to match the original select spec, without offlining the
        failed vnodes, and with 2 execjob_prologue hooks, prologue
        hook1 having alarm1 and prologue hook2 having alarm2.
        This also tests the default value of $sister_join_job_alarm.
        1. Submit a job with a select spec of 2 super-chunks, say (A)
           and (B), and 1 chunk of (C), along with a place spec of
           "scatter", resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=job_start
        2. Have a queuejob hook that adds 1 extra node to each chunk
           (except the MS (first) chunk), resulting in the assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B's specs while E mirrors
           chunk C.
        3. Prior to submitting the job, suspend mom B. When the job
           runs, momB won't be able to join the job, so it won't be
           considered a "healthy" mom.
        4. Have an execjob_prologue hook that doesn't fail any mom
           host, with alarm=alarm1, order=1.
        5. Have an execjob_prologue hook2 with alarm=alarm2, order=2,
           that fails (causes rejection) when executed by the mom
           managing vnodes in (C).
        6. Have an execjob_launch hook that prunes the job's
           exec_vnode assignment back to satisfying the original
           3-node select spec, choosing only healthy nodes.
        7. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(D)+(E)
              since (B) and (C) contain vnodes from failed moms.
           b. vnodes in (B) and (C) now show a state of "free".
           c. Mom's log file shows that the wait time between the
              execjob_prologue hook1 execution and the execjob_launch
              hook execution is no more than alarm1+alarm2.
           d. The accounting log start record 'S' reflects the select
              request where additional chunks were added, while the
              secondary start record 's' reflects the assigned
              resources after pruning the original select request via
              the pbs.release_nodes(keep_select=...) call inside the
              execjob_launch hook.
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body2)
        # instantiate execjob_prologue hook #1
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo1")
localnode=pbs.get_local_nodename()
"""
        hook_event = "execjob_prologue"
        hook_name = "prolo1"
        alarm1 = 17
        a = {'event': hook_event, 'enabled': 'true', 'order': 1,
             'alarm': alarm1}
        self.server.create_import_hook(hook_name, a, hook_body)
        # instantiate execjob_prologue hook #2; it deliberately
        # references an undefined name ('x') so the hook raises an
        # error when run on node C.
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing prolo2")
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "prolo2: found vnode_list[" + v.name + "]")
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "prolo2: found vnode_list_fail[" + v.name + "]")
localnode=pbs.get_local_nodename()
if not e.job.in_ms_mom() and (localnode == '%s'):
    x
""" % (self.nC,)
        hook_event = "execjob_prologue"
        hook_name = "prolo2"
        alarm2 = 16
        a = {'event': hook_event, 'enabled': 'true', 'order': 2,
             'alarm': alarm2}
        self.server.create_import_hook(hook_name, a, hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # temporarily suspend momB, simulating a failed mom host.
        self.momB.signal("-STOP")
        jid = self.create_and_submit_job('job1')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        # Set time to start scanning logs
        stime = int(time.time())
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches, reflecting the values pruned back to
        # the original select spec.  max_attempts=100 since the primary
        # mom waits for the sisters to join (default
        # $sister_join_job_alarm of 30 seconds) and then waits out the
        # prologue hook alarms (alarm1 + alarm2 seconds).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=100)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Verify the logs and make sure sister_join_job_alarm is honored
        logs = self.mom.log_match(
            "Executing begin",
            allmatch=True, starttime=stime, max_attempts=8)
        log1 = logs[0][1]
        logs = self.mom.log_match(
            "Executing prolo1",
            allmatch=True, starttime=stime, max_attempts=8)
        log2 = logs[0][1]
        pattern = '%m/%d/%Y %H:%M:%S'
        # Convert the log timestamps into epoch time
        tmp = log1.split(';')
        time1 = int(time.mktime(time.strptime(tmp[0], pattern)))
        tmp = log2.split(';')
        time2 = int(time.mktime(time.strptime(tmp[0], pattern)))
        diff = time2 - time1
        self.logger.info(
            "Time diff between begin hook and prologue hook is " +
            str(diff) + " seconds")
        # Leave a little wiggle room for slow systems;
        # this checks the default sister_join_job_alarm value
        sis_join_alarm = 30
        self.assertTrue((diff >= sis_join_alarm) and
                        (diff <= sis_join_alarm + 5))
        self.mom.log_match(
            "sister_join_job_alarm wait time %d secs exceeded" % (
                sis_join_alarm,), starttime=stime, max_attempts=8)
        # Verify the logs and make sure the prologue hook alarms are honored
        logs = self.mom.log_match(
            "Executing prolo1",
            allmatch=True, starttime=stime, max_attempts=8)
        log1 = logs[0][1]
        logs = self.mom.log_match(
            "Executing launch",
            allmatch=True, starttime=stime, max_attempts=8)
        log2 = logs[0][1]
        tmp = log1.split(';')
        time1 = int(time.mktime(time.strptime(tmp[0], pattern)))
        tmp = log2.split(';')
        time2 = int(time.mktime(time.strptime(tmp[0], pattern)))
        diff = time2 - time1
        self.logger.info(
            "Time diff between prolo1 hook and launch hook is " +
            str(diff) + " seconds")
        # Leave a little wiggle room for slow systems
        job_launch_delay = alarm1 + alarm2
        self.assertTrue((diff >= job_launch_delay) and
                        (diff <= job_launch_delay + 3))
        # Check various vnode states.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status(
            [self.nAv0, self.nAv1, self.nE, self.nEv0],
            'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # check server/queue counts
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        self.momA.log_match(
            "not all prologue hooks to sister moms completed, " +
            "but job will proceed to execute", n=10)
        # Check the vnode_list[] parameter in the execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo2: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check the vnode_list[] parameter in the execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check the vnode_list_fail[] parameter in the execjob_launch hook
        vnode_list_fail = [self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match(
                "Job;%s;launch: found vnode_list_fail[%s]" % (jid, vn), n=10)
        # Check the result of the pbs.event().job.release_nodes(keep_select)
        # call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # validate output
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momD.hostname, self.momE.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momD.shortname, self.momE.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momD.hostname, self.momD.hostname, self.momD.shortname,
       self.momE.hostname, self.momE.hostname, self.momE.shortname)
        self.logger.info("expected out=%s" % (expected_out,))
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.logger.info("job_out=%s" % (job_out,))
        self.assertEqual(job_out, expected_out)

    @timeout(400)
    def test_t4(self):
        """
        Test: tolerating 1 node failure at job start, where the failed
        node is used to satisfy a multi-chunk request, after adding
        extra nodes to the job and pruning the job's assigned
        resources to match the original select spec.
        1. Submit a job with a select spec of 2 super-chunks, say (A)
           and (B), and 1 chunk of (C), along with a place spec of
           "scatter", resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=job_start
        2. Have a queuejob hook that adds 1 extra node to each chunk
           (except the MS (first) chunk), resulting in the assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B's specs while E mirrors
           chunk C.
        3. Have an execjob_begin hook that fails (causes rejection)
           when executed by the mom managing vnodes in (B).
        4. Then create an execjob_launch hook that prunes the job's
           exec_vnode assignment back to satisfying the original
           3-node select spec, choosing only healthy nodes.
        5. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(D)+(C)
              since (B) contains vnodes from a failed mom.
           b. The accounting log start record 'S' reflects the select
              request where additional chunks were added, while the
              secondary start record 's' reflects the assigned
              resources after pruning the original select request via
              the pbs.release_nodes(keep_select=...) call inside the
              execjob_launch hook.
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches, reflecting the values pruned back to
        # the original select spec.  max_attempts=70 since it can take
        # up to 60 seconds for the primary mom to wait for the sisters
        # to join (default $sister_join_job_alarm of 30 seconds) and
        # for the sisters' execjob_prologue hooks (default
        # $job_launch_delay of 30 seconds).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1v2_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1v2_schedselect,
                                 'exec_host': self.job1v2_exec_host,
                                 'exec_vnode': self.job1v2_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=70)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Check various vnode states.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nE,
                                 self.nEv0, self.nEv1, self.nEv2,
                                 self.nEv3], 'free')
        # check server/queue counts
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1v2_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        # Check the vnode_list[] parameter in the execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check the vnode_list_fail[] parameter in the execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check the result of the pbs.event().job.release_nodes(keep_select)
        # call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1v2_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1v2_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1v2_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1v2_exec_host_esc,
                                  self.job1v2_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1v2_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # validate output
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momD.hostname, self.momC.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momD.shortname, self.momC.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momD.hostname, self.momD.hostname, self.momD.shortname,
       self.momC.hostname, self.momC.hostname, self.momC.shortname)
        self.logger.info("expected out=%s" % (expected_out,))
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.logger.info("job_out=%s" % (job_out,))
        self.assertEqual(job_out, expected_out)
    @timeout(400)
    def test_t5(self):
        """
        Test: tolerating 1 node failure at job_start in a regular
        chunk, after adding extra nodes to the job, pruning the
        job's assigned resources to match the original select spec.
        1. Submit a job with a select spec of 2 super-chunks, say
           (A) and (B), and 1 chunk of (C), along with a place spec
           of "scatter", resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=job_start
        2. Have a queuejob hook that adds 1 extra node to each
           chunk (except the MS (first) chunk), resulting in the
           assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B's specs while E mirrors
           chunk C.
        3. Have an execjob_prologue hook that fails (causes
           rejection) when executed by the mom managing vnodes in
           (C).
        4. Then create an execjob_launch hook that prunes the
           job's exec_vnode assignment back to satisfying the
           original 3-node select spec, choosing only healthy
           nodes.
        5. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(B)+(E)
              since (C) contains vnodes from failed moms.
           b. The accounting log start record 'S' will reflect the
              select request where additional chunks were added,
              while the secondary start record 's' will reflect
              the assigned resources after pruning the original
              select request via the
              pbs.release_nodes(keep_select=...) call inside the
              execjob_launch hook.
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body2)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook.
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # The job eventually launches, reflecting the values pruned back
        # to the original select spec.  max_attempts is set high since the
        # primary mom can wait up to 30 seconds for the sisters to join
        # (default $sister_join_job_alarm) plus up to 30 seconds for the
        # sisters' execjob_prologue hooks (default $job_launch_delay).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1v3_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1v3_schedselect,
                                 'exec_host': self.job1v3_exec_host,
                                 'exec_vnode': self.job1v3_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=70)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Check various vnode statuses.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1, self.nB, self.nBv0,
                                 self.nE, self.nEv0], 'job-busy', jobs_assn1,
                                1, '1048576kb')
        self.match_vnode_status([self.nAv2, self.nBv1],
                                'job-busy', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
                                 self.nC, self.nD, self.nEv1, self.nEv2,
                                 self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1v3_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match(
                "Job;%s;launch: found vnode_list_fail[%s]" % (jid, vn), n=10)
        # Check result of the pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1v3_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1v3_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1v3_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1v3_exec_host_esc,
                                  self.job1v3_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1v3_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # Validate output.
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momB.hostname, self.momE.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momB.shortname, self.momE.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momB.hostname, self.momB.hostname, self.momB.shortname,
       self.momE.hostname, self.momE.hostname, self.momE.shortname)
        self.logger.info("expected out=%s" % (expected_out,))
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.logger.info("job_out=%s" % (job_out,))
        self.assertEquals(job_out, expected_out)
    def test_t6(self):
        """
        Test: tolerating 2 node failures at job_start, used to
        satisfy the smaller chunks after adding extra nodes to the
        job, pruning the job's assigned resources to match the
        original select spec.
        1. Submit a job with a select spec of 2 super-chunks, say
           (A) and (B), and 1 chunk of (C), along with a place spec
           of "scatter", resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=job_start
        2. Have a queuejob hook that adds 1 extra node to each
           chunk (except the MS (first) chunk), resulting in the
           assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B's specs while E mirrors
           chunk C. (C) and (E) are smaller chunks than (B) and
           (D). For example:
               (D) = "(nadal:ncpus=3:mem=2097152kb)"
               (C) = "(lendl:ncpus=2:mem=2097152kb)"
        3. Have an execjob_begin hook that fails (causes
           rejection) when executed by the mom managing vnodes in
           (C).
        4. Have an execjob_prologue hook that fails (causes
           rejection) when executed by the mom managing vnodes in
           (E).
        5. Then create an execjob_launch hook that prunes the
           job's exec_vnode assignment back to satisfying the
           original 3-node select spec, choosing only healthy
           nodes (see the worked example after this docstring).
        6. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(B)+(D)
              since (C) and (E) contain vnodes from failed moms.
              Note that from (D), only enough resources are
              allocated to satisfy the smaller third requested
              chunk: if (D) originally has
              "(nadal:ncpus=3:mem=2097152kb)", the reassignment
              would only be "(nadal:ncpus=2:mem=2097152kb)".
           b. The accounting log start record 'S' will reflect the
              select request where additional chunks were added,
              while the secondary start record 's' will reflect
              the assigned resources after pruning the original
              select request via the
              pbs.release_nodes(keep_select=...) call inside the
              execjob_launch hook.
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body3)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body2)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook.
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # The job eventually launches, reflecting the values pruned back
        # to the original select spec.  max_attempts is set high since the
        # primary mom can wait up to 30 seconds for the sisters to join
        # (default $sister_join_job_alarm) plus up to 30 seconds for the
        # sisters' execjob_prologue hooks (default $job_launch_delay).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1v4_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1v4_schedselect,
                                 'exec_host': self.job1v4_exec_host,
                                 'exec_vnode': self.job1v4_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=70)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Check various vnode statuses.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1, self.nB, self.nBv0],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.nAv2, self.nBv1],
                                'job-busy', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
                                 self.nC, self.nD, self.nEv1, self.nEv2,
                                 self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1v4_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostE), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostE) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nC, self.nE]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of the pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1v4_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1v4_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1v4_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1v4_exec_host_esc,
                                  self.job1v4_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1v4_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # Validate output.
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momB.hostname, self.momD.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momB.shortname, self.momD.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momB.hostname, self.momB.hostname, self.momB.shortname,
       self.momD.hostname, self.momD.hostname, self.momD.shortname)
        self.logger.info("expected out=%s" % (expected_out,))
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.logger.info("job_out=%s" % (job_out,))
        self.assertEquals(job_out, expected_out)
    def test_t7(self):
        """
        Test: tolerating 2 node failures at job_start, used to
        satisfy the larger chunks after adding extra nodes to the
        job. Pruning the job's assigned resources to match the
        original select spec fails, since the unsatisfied chunk
        requests cannot be handled by the remaining, smaller-sized
        nodes. The failure to prune the job is followed by a
        pbs.event().job.rerun() action and a job hold. This test
        also exercises setting tolerate_node_failures=none.
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_prologue hook
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing prologue")
localnode=pbs.get_local_nodename()
if not e.job.in_ms_mom() and (localnode == '%s'):
    x
""" % (self.nD,)
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook.
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # This time the job cannot be pruned back to the original select
        # spec: the unsatisfied chunks are too big for the remaining
        # healthy nodes, so the launch hook rejects the event, reruns the
        # job, and leaves it held.
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostD) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostD) +
            "as job is tolerant of node failures", n=10, regexp=True)
        self.momA.log_match("Job;%s;could not satisfy select chunk" % (jid,),
                            n=10)
        self.momA.log_match("Job;%s;NEED chunks for keep_select" % (jid,),
                            n=10)
        self.momA.log_match(
            "Job;%s;HAVE chunks from job's exec_vnode" % (jid,), n=10)
        self.momA.log_match("execjob_launch request rejected by 'launch'",
                            n=10)
        errmsg = "unsuccessful at LAUNCH"
        self.momA.log_match("Job;%s;%s" % (jid, errmsg,), n=10)
        self.server.expect(JOB, {'job_state': 'H'},
                           id=jid, interval=1, max_attempts=70)
        # turn off the queuejob hook
        self.server.manager(MGR_CMD_SET, HOOK, {'enabled': 'false'}, 'qjob')
        # modify the job so it no longer tolerates node failures
        a = {ATTR_tolerate_node_failures: "none"}
        self.server.alterjob(jobid=jid, attrib=a)
        # release the hold on the job
        self.server.rlsjob(jobid=jid, holdtype='s')
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+could not JOIN_JOB" % (
                jid), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostE) +
            "is tolerant of node failures",
            regexp=True, n=10, existence=False, max_attempts=10)
        self.server.expect(JOB, {'job_state': 'H'},
                           id=jid, interval=1, max_attempts=15)
        # turn off the begin hook, leaving the prologue hook in place
        self.server.manager(MGR_CMD_SET, HOOK, {'enabled': 'false'}, 'begin')
        # release the hold on the job
        self.server.rlsjob(jobid=jid, holdtype='s')
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.momA.log_match(
            "Job;%s;job_start_error.+could not IM_EXEC_PROLOGUE" % (jid,),
            n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True,
            existence=False, max_attempts=10)
        self.server.expect(JOB, {'job_state': 'H'},
                           id=jid, interval=1, max_attempts=15)
        # turn off the prologue hook, so only the launch hook remains
        self.server.manager(MGR_CMD_SET, HOOK, {'enabled': 'false'}, 'prolo')
        # release the hold on the job
        self.server.rlsjob(jobid=jid, holdtype='s')
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'none',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'exec_host': self.job1_iexec_host,
                                 'exec_vnode': self.job1_iexec_vnode,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        # With tolerate_node_failures=none, the launch hook's
        # release_nodes() call is a no-op.
        emsg = "no nodes released as job does not tolerate node failures"
        self.momA.log_match("%s: %s" % (jid, emsg), n=30)
    def test_t8(self):
        """
        Test tolerating node failures at job startup with no
        failed moms.
        1. Submit a job with a select spec of 2 super-chunks, say
           (A) and (B), and 1 chunk of (C), along with a place spec
           of "scatter", resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=job_start
        2. Have a queuejob hook that adds 1 extra node to each
           chunk (except the MS (first) chunk), resulting in the
           assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B's specs while E mirrors
           chunk C.
        3. Have execjob_begin and execjob_prologue hooks that do
           not cause a failure on any of the sister moms.
        4. Then create an execjob_launch hook that prunes the
           job's exec_vnode assignment back to satisfying the
           original 3-node select spec, choosing only healthy
           nodes.
        5. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(B)+(C)
           b. The accounting log start record 'S' will reflect the
              select request where additional chunks were added,
              while the secondary start record 's' will reflect
              the assigned resources after pruning the original
              select request via the
              pbs.release_nodes(keep_select=...) call inside the
              execjob_launch hook.
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body2)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body3)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook.
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # The job eventually launches, reflecting the values pruned back
        # to the original select spec.  max_attempts is set high since the
        # primary mom can wait up to 30 seconds for the sisters to join
        # (default $sister_join_job_alarm) plus up to 30 seconds for the
        # sisters' execjob_prologue hooks (default $job_launch_delay).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1v5_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1v5_schedselect,
                                 'exec_host': self.job1v5_exec_host,
                                 'exec_vnode': self.job1v5_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=60)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Check various vnode statuses.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status(
            [self.nAv0, self.nAv1, self.nB, self.nBv0],
            'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2, self.nBv1],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
                                 self.nE, self.nEv0, self.nEv1, self.nEv2,
                                 self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1v5_exec_host))
        # Check vnode_list[] parameter in execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check result of the pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1v5_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1v5_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1v5_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1v5_exec_host_esc,
                                  self.job1v5_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1v5_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # Validate output.
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momB.hostname, self.momC.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momB.shortname, self.momC.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momB.hostname, self.momB.hostname, self.momB.shortname,
       self.momC.hostname, self.momC.hostname, self.momC.shortname)
        self.logger.info("expected out=%s" % (expected_out,))
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.logger.info("job_out=%s" % (job_out,))
        self.assertEquals(job_out, expected_out)
    @timeout(400)
    def test_t9(self):
        """
        Test tolerating 'all' node failures at job startup and
        within the life of the job.
        1. Submit a job with a select spec of 2 super-chunks, say
           (A) and (B), and 1 chunk of (C), along with a place spec
           of "scatter", resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=all
        2. Have a queuejob hook that adds 1 extra node to each
           chunk (except the MS (first) chunk), resulting in the
           assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B's specs while E mirrors
           chunk C.
        3. Have an execjob_begin hook that fails (causes rejection)
           when executed by the mom managing vnodes in (B).
        4. Have an execjob_prologue hook that fails (causes
           rejection) when executed by the mom managing vnodes in
           (C).
        5. Then create an execjob_launch hook that prunes the
           job's exec_vnode assignment back to satisfying the
           original 3-node select spec, choosing only healthy
           nodes.
        6. Now kill -KILL the mom on host hostD.
        7. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(D)+(E)
              since (B) and (C) contain vnodes from failed moms.
           b. The job continues to run even after nodeD goes down,
              with only an indication in mom_logs via the message:
                  im_eof, Premature end of message from addr n stream 4
        """
        # set this so the job kill is not delayed for long
        c = {'$max_poll_downtime': 10}
        self.momA.add_config(c)
        # instantiate queuejob hook; tolerate_node_failures is set to 'all'
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body2)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1_2')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook.
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'all',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # The job eventually launches, reflecting the values pruned back
        # to the original select spec.  max_attempts is set high since the
        # primary mom can wait up to 30 seconds for the sisters to join
        # (default $sister_join_job_alarm) plus up to 30 seconds for the
        # sisters' execjob_prologue hooks (default $job_launch_delay).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'all',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=60)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Check various vnode statuses.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status(
            [self.nAv0, self.nAv1, self.nE, self.nEv0],
            'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_prologue hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match(
                "Job;%s;launch: found vnode_list_fail[%s]" % (jid, vn), n=10)
        # Check result of the pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1_sel_esc)
        # temporarily kill momD, simulating a failed mom host
        self.momD.signal("-KILL")
        self.momA.log_match("im_eof, Premature end of message.+on stream 4",
                            n=10, max_attempts=30, interval=2, regexp=True)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # Validate output.
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 2 hostname
%s
""" % (jid, self.momA.hostname, self.momD.hostname, self.momE.hostname,
       self.fib37_value, self.fib37_value, self.momA.shortname,
       self.momE.shortname)
        self.logger.info("expected out=%s" % (expected_out,))
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.logger.info("job_out=%s" % (job_out,))
        self.assertEquals(job_out, expected_out)
        self.momD.start()
    def test_t10(self):
        """
        Test tolerating node failures at job startup, but also
        cause a failure on one of the nodes after the job has
        started.
        1. Submit a job with a select spec of 2 super-chunks, say
           (A) and (B), and 1 chunk of (C), along with a place spec
           of "scatter", resulting in the following assignment:
               exec_vnode = (A)+(B)+(C)
           and -Wtolerate_node_failures=job_start
        2. Have a queuejob hook that adds 1 extra node to each
           chunk (except the MS (first) chunk), resulting in the
           assignment:
               exec_vnode = (A)+(B)+(D)+(C)+(E)
           where D mirrors super-chunk B's specs while E mirrors
           chunk C.
        3. Have an execjob_begin hook that fails (causes rejection)
           when executed by the mom managing vnodes in (B).
        4. Have an execjob_prologue hook that fails (causes
           rejection) when executed by the mom managing vnodes in
           (C).
        5. Then create an execjob_launch hook that prunes the
           job's exec_vnode assignment back to satisfying the
           original 3-node select spec, choosing only healthy
           nodes.
        6. Now kill -KILL the mom on host hostD.
        7. Result:
           a. This results in the following reassignment of chunks:
                  exec_vnode = (A)+(D)+(E)
              since (B) and (C) contain vnodes from failed moms.
           b. The job eventually aborts after nodeD goes down, with
              an indication in mom_logs via the messages:
                  "im_eof, lost communication with <host>"
                  "node EOF 1 (<host>)"
                  "kill_job"
        """
        # set this so the job kill is not delayed for long
        c = {'$max_poll_downtime': 10}
        self.momA.add_config(c)
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1_3')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook.
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # The job eventually launches, reflecting the values pruned back
        # to the original select spec.  max_attempts is set high since the
        # primary mom can wait up to 30 seconds for the sisters to join
        # (default $sister_join_job_alarm) plus up to 30 seconds for the
        # sisters' execjob_prologue hooks (default $job_launch_delay).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=60)
        # Check various vnode statuses.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status(
            [self.nAv0, self.nAv1, self.nE, self.nEv0],
            'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_prologue hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of the pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1_exec_host_esc,
                                  self.job1_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1_sel_esc)
        # temporarily kill momD, simulating a failed mom host
        self.momD.signal("-KILL")
        self.momA.log_match(
            "Job;%s;im_eof, lost communication with %s.+killing job now" % (
                jid, self.nD), n=10, max_attempts=30, interval=2, regexp=True)
        self.momA.log_match("Job;%s;kill_job" % (jid,),
                            n=10, max_attempts=60, interval=2)
        self.momD.start()
    def test_t11(self):
        """
        Test: tolerating node failures at job startup with a job
        having an ncpus=0 assignment. This ensures the hooks will
        have the info for the ncpus=0 chunks in
        pbs.event().vnode_list[].
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job2')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook.
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 9,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job2_iselect,
                                 'Resource_List.site': self.job2_oselect,
                                 'Resource_List.place': self.job2_place,
                                 'schedselect': self.job2_ischedselect},
                           max_attempts=10, id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # The job eventually launches, reflecting the values pruned back
        # to the original select spec.  max_attempts is set high since the
        # primary mom can wait up to 30 seconds for the sisters to join
        # (default $sister_join_job_alarm) plus up to 30 seconds for the
        # sisters' execjob_prologue hooks (default $job_launch_delay).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 6,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job2_select,
                                 'Resource_List.place': self.job2_place,
                                 'schedselect': self.job2_schedselect,
                                 'exec_host': self.job2_exec_host,
                                 'exec_vnode': self.job2_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND,
                           max_attempts=60)
        # Check various vnode statuses.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nE, self.nEv0],
                                'free', jobs_assn1, 0, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job2_exec_host_nfile))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_prologue hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match(
                "Job;%s;launch: found vnode_list_fail[%s]" % (jid, vn), n=10)
        # Check result of the pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job2_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job2_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job2_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job2_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job2_iexec_host_esc,
                                  self.job2_iexec_vnode_esc, "10gb", 9, 5,
                                  self.job2_place,
                                  self.job2_isel_esc)
        self.match_accounting_log('s', jid, self.job2_exec_host_esc,
                                  self.job2_exec_vnode_esc,
                                  "6gb", 6, 3,
                                  self.job2_place,
                                  self.job2_sel_esc)

    def test_t12(self):
        """
        Test: tolerating node failures at job startup with
        extra resources requested such as mpiprocs and
        ompthreads, which affect the contents of $PBS_NODEFILE.
        """
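        # Note (assumption for context): with mpiprocs in the select spec,
        # $PBS_NODEFILE is expected to list each assigned host once per
        # mpiprocs value, so the nodefile check below passes the pruned
        # schedselect to pbs_nodefile_match_exec_host() rather than just
        # the exec_host.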
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job3')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job3_iselect,
                                 'Resource_List.site': self.job3_oselect,
                                 'Resource_List.place': self.job3_place,
                                 'schedselect': self.job3_ischedselect},
                           max_attempts=10, id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches, reflecting values pruned back to the
        # original select spec. max_attempts=60 because it can take up to
        # 60 seconds for the primary mom to wait for the sisters to join
        # (default $sister_join_job_alarm of 30 seconds) and to wait for
        # the sisters to finish their execjob_prologue hooks (default
        # $job_launch_delay value of 30 seconds).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job3_select,
                                 'Resource_List.place': self.job3_place,
                                 'schedselect': self.job3_schedselect,
                                 'exec_host': self.job3_exec_host,
                                 'exec_vnode': self.job3_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nE, self.nEv0],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job3_exec_host,
                                              self.job3_schedselect))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_prologue hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job3_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job3_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job3_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job3_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job3_iexec_host_esc,
                                  self.job3_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job3_place,
                                  self.job3_isel_esc)
        self.match_accounting_log('s', jid, self.job3_exec_host_esc,
                                  self.job3_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job3_place,
                                  self.job3_sel_esc)

    def test_t13(self):
        """
        Test: pbs.event().job.select.increment_chunks() method.
        """
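        # increment_chunks() returns a new select spec with the per-chunk
        # counts increased: an int or numeric string adds that many to each
        # chunk, a percentage string scales each chunk up, and a dict maps
        # chunk index -> increment for per-chunk control. Based on the
        # log_match() expectations below, the first chunk's single
        # primary-mom (MS) instance appears to be excluded from
        # incrementing; e.g. incrementing
        #     "ncpus=3:mem=1gb+1:ncpus=2:mem=2gb+2:ncpus=1:mem=3gb"
        # by 2 yields
        #     "1:ncpus=3:mem=1gb+3:ncpus=2:mem=2gb+4:ncpus=1:mem=3gb",
        # and a 50% bump of "5:ncpus=3:mem=1gb" gives 1 + (4 * 1.5) = 7.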
  3542. # instantiate queuejob hook
  3543. hook_body = """
  3544. import pbs
  3545. e=pbs.event()
  3546. sel=pbs.select("ncpus=3:mem=1gb+1:ncpus=2:mem=2gb+2:ncpus=1:mem=3gb")
  3547. inp=2
  3548. isel=sel.increment_chunks(inp)
  3549. pbs.logmsg(pbs.LOG_DEBUG, "sel=%s" % (sel,))
  3550. pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%d)=%s" % (inp,isel))
  3551. inp="3"
  3552. isel=sel.increment_chunks(inp)
  3553. pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%s)=%s" % (inp,isel))
  3554. inp="23.5%"
  3555. isel=sel.increment_chunks(inp)
  3556. pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%s)=%s" % (inp,isel))
  3557. inp={0: 0, 1: 4, 2: "50%"}
  3558. isel=sel.increment_chunks(inp)
  3559. pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%s)=%s" % (inp,isel))
  3560. sel=pbs.select("5:ncpus=3:mem=1gb+1:ncpus=2:mem=2gb+2:ncpus=1:mem=3gb")
  3561. pbs.logmsg(pbs.LOG_DEBUG, "sel=%s" % (sel,))
  3562. inp={0: "50%", 1: "50%", 2: "50%"}
  3563. isel=sel.increment_chunks(inp)
  3564. pbs.logmsg(pbs.LOG_DEBUG, "sel.increment_chunks(%s)=%s" % (inp,isel))
  3565. """
  3566. hook_event = "queuejob"
  3567. hook_name = "qjob"
  3568. a = {'event': hook_event, 'enabled': 'true'}
  3569. self.server.create_import_hook(hook_name, a, hook_body)
  3570. a = {'scheduling': 'false'}
  3571. self.server.manager(MGR_CMD_SET, SERVER, a)
  3572. j1 = Job(TEST_USER)
  3573. j1.set_sleep_time(10)
  3574. self.server.submit(j1)
  3575. # Verify server_logs
  3576. self.server.log_match(
  3577. "sel=ncpus=3:mem=1gb+1:ncpus=2:mem=2gb+2:ncpus=1:mem=3gb", n=10)
  3578. self.server.log_match(
  3579. "sel.increment_chunks(2)=1:ncpus=3:mem=1gb+" +
  3580. "3:ncpus=2:mem=2gb+4:ncpus=1:mem=3gb", n=10)
  3581. self.server.log_match(
  3582. "sel.increment_chunks(3)=1:ncpus=3:mem=1gb+" +
  3583. "4:ncpus=2:mem=2gb+5:ncpus=1:mem=3gb", n=10)
  3584. self.server.log_match(
  3585. "sel.increment_chunks(23.5%)=1:ncpus=3:mem=1gb+" +
  3586. "2:ncpus=2:mem=2gb+3:ncpus=1:mem=3gb", n=10)
  3587. self.server.log_match(
  3588. "sel.increment_chunks({0: 0, 1: 4, 2: \'50%\'})=1:ncpus=3:" +
  3589. "mem=1gb+5:ncpus=2:mem=2gb+3:ncpus=1:mem=3gb", n=10)
  3590. self.server.log_match(
  3591. "sel=5:ncpus=3:mem=1gb+1:ncpus=2:mem=2gb+2:ncpus=1:mem=3gb",
  3592. n=10)
  3593. self.server.log_match(
  3594. "sel.increment_chunks({0: \'50%\', 1: \'50%\', 2: \'50%\'})=" +
  3595. "7:ncpus=3:mem=1gb+2:ncpus=2:mem=2gb+3:ncpus=1:mem=3gb", n=10)

    def test_t14(self):
        """
        Test: tolerating job_start with no node failures,
        but pruning the job's assigned nodes to satisfy the original
        select spec + 1 additional node.
        Basically, given an original spec requiring
        3 nodes, a queuejob hook has added 2 more nodes,
        resulting in a new assignment:
            exec_vnode=(A)+(B)+(C)+(D)+(E)
        where (C) mirrors (B) and satisfies the second chunk, and (E)
        mirrors (D) and satisfies the third chunk.
        Pruning the assigned nodes down to 4 nodes then
        results in:
            exec_vnode=(A)+(B)+(D)+(e1)
        where (E) is a super-chunk of the form (e1+e2) and only
        the 'e1' part is needed.
        """
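        # The launch hook below prunes the job not back to the original
        # select spec (saved by the queuejob hook in Resource_List["site"])
        # but to that spec plus one extra "ncpus=1:mem=1gb" chunk, so the
        # job keeps 4 of the 5 assigned nodes; only the 'e1' piece of
        # super-chunk (E) is needed to satisfy the added chunk.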
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_launch hook
        hook_body = """
import pbs
e=pbs.event()

if 'PBS_NODEFILE' not in e.env:
    e.accept()

pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")

for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")

for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")

if e.job.in_ms_mom():
    new_jsel = e.job.Resource_List["site"] + "+ncpus=1:mem=1gb"
    pj = e.job.release_nodes(keep_select=new_jsel)
    pbs.logmsg(pbs.LOG_DEBUG, "release_nodes(keep_select=%s)" % (new_jsel,))
    if pj != None:
        pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
        pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
        pbs.logjobmsg(e.job.id,
                      "launch: job.schedselect=%s" % (pj.schedselect,))
    else:
        e.job.delete()
        msg = "unsuccessful at LAUNCH"
        e.reject(msg)
"""
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1_4')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '7gb',
                                 'Resource_List.ncpus': 9,
                                 'Resource_List.nodect': 4,
                                 'Resource_List.select': self.job1v6_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1v6_schedselect,
                                 'exec_host': self.job1v6_exec_host,
                                 'exec_vnode': self.job1v6_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status(
            [self.nAv0, self.nAv1, self.nB, self.nBv0, self.nE],
            'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2, self.nBv1],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
                                 self.nEv0, self.nEv1, self.nEv2,
                                 self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 9,
                                    'resources_assigned.mem': '7340032kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 9,
                                   'resources_assigned.mem': '7340032kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1v6_exec_host))
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1v6_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1v6_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1v6_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1v6_exec_host_esc,
                                  self.job1v6_exec_vnode_esc,
                                  "7gb", 9, 4,
                                  self.job1_place,
                                  self.job1v6_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # validate output
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
pbsdsh -n 3 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
pbsdsh -n 3 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momB.hostname, self.momC.hostname,
       self.momE.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.fib37_value,
       self.momA.shortname, self.momB.shortname, self.momC.shortname,
       self.momE.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momB.hostname, self.momB.hostname, self.momB.shortname,
       self.momC.hostname, self.momC.hostname, self.momC.shortname,
       self.momE.hostname, self.momE.hostname, self.momE.shortname)
        self.logger.info("expected out=%s" % (expected_out,))
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.logger.info("job_out=%s" % (job_out,))
        self.assertEquals(job_out, expected_out)

    def test_t15(self):
        """
        Test: tolerating job_start with no node failures,
        but pruning the job's assigned nodes to satisfy the original
        select spec minus 1 node, except one of the chunks is
        unsatisfiable. This time, the pbs.event().job.delete()
        action is taken on a failure to prune the job.
        """
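        # The keep_select used below asks for an "ncpus=5:mem=3gb" chunk
        # that none of the job's assigned vnodes can supply, so
        # release_nodes() is expected to return None and the hook takes the
        # job.delete() + reject path ("could not satisfy select chunk" in
        # the mom logs).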
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_launch hook
        hook_body = """
import pbs
e=pbs.event()

if 'PBS_NODEFILE' not in e.env:
    e.accept()

pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")

for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")

for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")

if e.job.in_ms_mom():
    new_jsel = "ncpus=3:mem=2gb+ncpus=5:mem=3gb"
    pj = e.job.release_nodes(keep_select=new_jsel)
    pbs.logmsg(pbs.LOG_DEBUG, "release_nodes(keep_select=%s)" % (new_jsel,))
    if pj != None:
        pbs.logjobmsg(e.job.id, "launch: job.exec_vnode=%s" % (pj.exec_vnode,))
        pbs.logjobmsg(e.job.id, "launch: job.exec_host=%s" % (pj.exec_host,))
        pbs.logjobmsg(e.job.id,
                      "launch: job.schedselect=%s" % (pj.schedselect,))
    else:
        e.job.delete()
        msg = "unsuccessful at LAUNCH"
        e.reject(msg)
"""
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1_4')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.momA.log_match("Job;%s;could not satisfy select chunk" % (jid,),
                            n=10, max_attempts=60, interval=2)
        self.momA.log_match("Job;%s;NEED chunks for keep_select" % (jid,),
                            n=10)
        self.momA.log_match(
            "Job;%s;HAVE chunks from job's exec_vnode" % (jid,), n=10)
        self.momA.log_match("execjob_launch request rejected by 'launch'",
                            n=10)
        errmsg = "unsuccessful at LAUNCH"
        self.momA.log_match("Job;%s;%s" % (jid, errmsg), n=10)
        self.server.expect(JOB, 'queue', op=UNSET, id=jid)

    def test_t16(self):
        """
        Test: tolerating node failures at job startup with
        a job submitted with -l place="scatter:excl".
        Like jobs submitted with only "-l place=scatter",
        except the assigned vnodes end up in a
        "job-exclusive" state.
        """
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
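        # This begin hook deliberately references an undefined name ('x')
        # when running on vnode nB's mom, so the hook errors out there and
        # the JOIN_JOB from that sister host fails, exercising the
        # tolerant job-start path.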
        hook_body = """
import pbs
e=pbs.event()

pbs.logmsg(pbs.LOG_DEBUG, "Executing begin")
localnode=pbs.get_local_nodename()
if not e.job.in_ms_mom() and (localnode == '%s'):
    x
""" % (self.nB,)
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job4')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job4_iselect,
                                 'Resource_List.site': self.job4_oselect,
                                 'Resource_List.place': self.job4_place,
                                 'schedselect': self.job4_ischedselect},
                           max_attempts=10, id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches, reflecting values pruned back to the
        # original select spec. max_attempts=60 because it can take up to
        # 60 seconds for the primary mom to wait for the sisters to join
        # (default $sister_join_job_alarm of 30 seconds) and to wait for
        # the sisters to finish their execjob_prologue hooks (default
        # $job_launch_delay value of 30 seconds).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job4_select,
                                 'Resource_List.place': self.job4_place,
                                 'schedselect': self.job4_schedselect,
                                 'exec_host': self.job4_exec_host,
                                 'exec_vnode': self.job4_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1],
                                'job-exclusive', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nE, self.nEv0],
                                'job-exclusive', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-exclusive', jobs_assn1, 1, '0kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'job-exclusive', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nC,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job4_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+" % (jid, self.hostC) +
            "could not IM_EXEC_PROLOGUE", n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+" % (jid, self.hostC) +
            "as job is tolerant of node failures", n=10, regexp=True)
        # Check vnode_list[] parameter in execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_prologue hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;prolo: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1, self.nC]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job4_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job4_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job4_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job4_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job4_iexec_host_esc,
                                  self.job4_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job4_place,
                                  self.job4_isel_esc)
        self.match_accounting_log('s', jid, self.job4_exec_host_esc,
                                  self.job4_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job4_place,
                                  self.job4_sel_esc)

    def test_t17(self):
        """
        Test: tolerating 1 node failure at job startup with
        a job submitted with -l place="free".
        Like jobs submitted with only "-l place=scatter",
        except some vnodes from the same mom would get
        allocated to satisfy different chunks.
        This test breaks apart one of the multi-chunks of
        the form (b1+b2+b3) so that upon reassignment,
        (b1+b2) is used.
        """
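        # With place="free" the scheduler may satisfy different chunks from
        # vnodes of the same mom. The checks below therefore expect one
        # vnode (nBv1) to appear twice in the job list, once for a mem-only
        # piece and once for ncpus, and the multi-chunk (b1+b2+b3) to be
        # pruned down to (b1+b2).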
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body4)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body3)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body2)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job5')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job5_iselect,
                                 'Resource_List.site': self.job5_oselect,
                                 'Resource_List.place': self.job5_place,
                                 'schedselect': self.job5_ischedselect},
                           max_attempts=10, id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Job eventually launches, reflecting values pruned back to the
        # original select spec. max_attempts=60 because it can take up to
        # 60 seconds for the primary mom to wait for the sisters to join
        # (default $sister_join_job_alarm of 30 seconds) and to wait for
        # the sisters to finish their execjob_prologue hooks (default
        # $job_launch_delay value of 30 seconds).
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 7,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job5_select,
                                 'Resource_List.place': self.job5_place,
                                 'schedselect': self.job5_schedselect,
                                 'exec_host': self.job5_exec_host,
                                 'exec_vnode': self.job5_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND, max_attempts=60)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status(
            [self.nAv0, self.nAv1, self.nB, self.nBv0],
            'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2, self.nBv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        # due to free placement, the job appears twice as the vnode has
        # been allocated twice, once for mem only and once for ncpus
        jobs_assn2 = "%s/0, %s/0" % (jid, jid)
        self.match_vnode_status([self.nBv1],
                                'job-busy', jobs_assn2, 1, '1048576kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nC, self.nD,
                                 self.nE, self.nEv0, self.nEv1,
                                 self.nEv2, self.nEv3, self.nBv3], 'free')
        # Check server/queue counts.
        self.server.expect(SERVER, {'resources_assigned.ncpus': 7,
                                    'resources_assigned.mem': '5242880kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 7,
                                   'resources_assigned.mem': '5242880kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job5_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostD), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostD) +
            "is tolerant of node failures",
            regexp=True, n=10)
        # Check vnode_list[] parameter in execjob_prologue hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nD]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job5_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job5_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job5_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job5_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job5_iexec_host_esc,
                                  self.job5_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job5_place,
                                  self.job5_isel_esc)
        self.match_accounting_log('s', jid, self.job5_exec_host_esc,
                                  self.job5_exec_vnode_esc,
                                  "5gb", 7, 3,
                                  self.job5_place,
                                  self.job5_sel_esc)

    def test_t18(self):
        """
        Test: rerunning (i.e. qrerun) a node failure tolerant job
        while it is still waiting for healthy nodes. Upon qrerun,
        the job should get killed, requeued, and restarted.
        """
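        # substate 41 below is JOB_SUBSTATE_PRERUN: the job shows state 'R'
        # while the primary mom is still waiting out
        # $sister_join_job_alarm / $job_launch_delay before pruning, so the
        # qrerun issued later catches the job mid-startup.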
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        jid = self.create_and_submit_job('job1')
        # The job's substate is 41 (PRERUN) since, being a node failure
        # tolerant job, it is still waiting for healthy nodes. With no
        # prologue hook, the MS waits the default 30 seconds for healthy
        # nodes.
        self.server.expect(JOB, {'job_state': 'R',
                                 'substate': 41,
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'exec_host': self.job1_iexec_host,
                                 'exec_vnode': self.job1_iexec_vnode,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1, self.nB, self.nBv0,
                                 self.nE, self.nEv0],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2, self.nBv1],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
                                 self.nEv1, self.nEv2, self.nEv3], 'free')
        # check server/queue counts
        self.server.expect(SERVER, {'resources_assigned.ncpus': 13,
                                    'resources_assigned.mem': '10485760kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 13,
                                   'resources_assigned.mem': '10485760kb'},
                           id='workq', attrop=PTL_AND)
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.rerunjob(jid)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid)
        self.match_vnode_status([self.nA, self.nAv0, self.nAv1, self.nAv2,
                                 self.nAv3,
                                 self.nB, self.nBv0, self.nBv1, self.nBv2,
                                 self.nBv3, self.nC, self.nD, self.nE,
                                 self.nEv0, self.nEv1, self.nEv2,
                                 self.nEv3], 'free')
        # check server/queue counts
        self.server.expect(SERVER, {'resources_assigned.ncpus': 0,
                                    'resources_assigned.mem': '0kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 0,
                                   'resources_assigned.mem': '0kb'},
                           id='workq', attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Now job should start running again
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1v2_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1v2_schedselect,
                                 'exec_host': self.job1v2_exec_host,
                                 'exec_vnode': self.job1v2_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND, max_attempts=70)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
        self.match_vnode_status([self.nAv0, self.nAv1],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        self.match_vnode_status([self.nAv2],
                                'job-busy', jobs_assn1, 1, '0kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.nC], 'job-busy', jobs_assn2,
                                2, '2097152kb')
        jobs_assn3 = "%s/0, %s/1, %s/2" % (jid, jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn3,
                                3, '2097152kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nB, self.nBv0,
                                 self.nBv1, self.nBv2, self.nBv3, self.nE,
                                 self.nEv0, self.nEv1, self.nEv2,
                                 self.nEv3], 'free')
        # check server/queue counts
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1v2_exec_host))
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1v2_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1v2_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1v2_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1v2_exec_host_esc,
                                  self.job1v2_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1v2_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # validate output
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momD.hostname, self.momC.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momD.shortname, self.momC.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momD.hostname, self.momD.hostname, self.momD.shortname,
       self.momC.hostname, self.momC.hostname, self.momC.shortname)
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.assertEquals(job_out, expected_out)
        # Re-check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Re-check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nB, self.nBv0, self.nBv1]
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1v2_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1v2_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1v2_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1v2_exec_host_esc,
                                  self.job1v2_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1v2_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
        # validate output
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momD.hostname, self.momC.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momD.shortname, self.momC.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momD.hostname, self.momD.hostname, self.momD.shortname,
       self.momC.hostname, self.momC.hostname, self.momC.shortname)
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.assertEquals(job_out, expected_out)

    def test_t19(self):
        """
        Test: issuing a node release request against a node failure
        tolerant job that is still waiting for healthy nodes. The call
        to pbs_release_nodes should fail given that the job
        is not fully running yet and is still determining which of the
        assigned nodes are deemed good.
        """
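        # pbs_release_nodes is expected to be rejected here: the job is
        # reported 'R' but is still in PRERUN (substate 41), and node
        # release is only valid once the job is fully running.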
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body)
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        jid = self.create_and_submit_job('job1')
        # The job's substate is 41 (PRERUN) since, being a node failure
        # tolerant job, it is still waiting for healthy nodes
        self.server.expect(JOB, {'job_state': 'R',
                                 'substate': 41,
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'exec_host': self.job1_iexec_host,
                                 'exec_vnode': self.job1_iexec_vnode,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        # Verify mom_logs
        self.momA.log_match(
            "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                jid, self.hostB), n=10, regexp=True)
        self.momA.log_match(
            "Job;%s;ignoring error from %s.+as job " % (jid, self.hostB) +
            "is tolerant of node failures",
            regexp=True, n=10)
        # Run pbs_release_nodes on a job whose state is running but
        # whose substate is still PRERUN
        pbs_release_nodes_cmd = os.path.join(
            self.server.pbs_conf['PBS_EXEC'], 'bin', 'pbs_release_nodes')
        cmd = [pbs_release_nodes_cmd, '-j', jid, '-a']
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     runas=TEST_USER)
        self.assertNotEqual(ret['rc'], 0)
        self.assertTrue(ret['err'][0].startswith(
            'pbs_release_nodes: Request invalid for state of job'))

    def test_t20(self):
        """
        Test: node failure tolerant job array, with multiple subjobs
        starting at the same time, and the job's assigned resources
        pruned to match up to the original select spec, using
        an execjob_prologue hook this time.
        """
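        # Unlike the earlier tests, the pruning here is done by the
        # execjob_prologue hook (prolo_hook_body4), so the
        # "pruned from/to exec_vnode" results below are logged with a
        # "prolo:" tag rather than "launch:".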
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_begin hook
        hook_event = "execjob_begin"
        hook_name = "begin"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.begin_hook_body5)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body4)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('jobA')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 5,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.jobA_iselect,
                                 'Resource_List.site': self.jobA_oselect,
                                 'Resource_List.place': self.jobA_place,
                                 'schedselect': self.jobA_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.expect(JOB, {'job_state': 'B',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '5gb',
                                 'Resource_List.ncpus': 5,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.jobA_iselect,
                                 'Resource_List.site': self.jobA_oselect,
                                 'Resource_List.place': self.jobA_place,
                                 'schedselect': self.jobA_ischedselect},
                           id=jid, attrop=PTL_AND)
        self.server.expect(JOB, {'job_state=R': 3}, extend='t')
        for idx in range(1, 4):
            sjid = create_subjob_id(jid, idx)
            if idx == 1:
                iexec_host_esc = self.jobA_iexec_host1_esc
                iexec_vnode = self.jobA_iexec_vnode1
                iexec_vnode_esc = self.jobA_iexec_vnode1_esc
                exec_host = self.jobA_exec_host1
                exec_host_esc = self.jobA_exec_host1_esc
                exec_vnode = self.jobA_exec_vnode1
                exec_vnode_esc = self.jobA_exec_vnode1_esc
                vnode_list = [self.nAv0, self.nB, self.nC,
                              self.nD, self.nE]
            elif idx == 2:
                iexec_host_esc = self.jobA_iexec_host2_esc
                iexec_vnode = self.jobA_iexec_vnode2
                iexec_vnode_esc = self.jobA_iexec_vnode2_esc
                exec_host = self.jobA_exec_host2
                exec_host_esc = self.jobA_exec_host2_esc
                exec_vnode = self.jobA_exec_vnode2
                exec_vnode_esc = self.jobA_exec_vnode2_esc
                vnode_list = [self.nAv1, self.nBv0, self.nC,
                              self.nD, self.nEv0]
            elif idx == 3:
                iexec_host_esc = self.jobA_iexec_host3_esc
                iexec_vnode = self.jobA_iexec_vnode3
                iexec_vnode_esc = self.jobA_iexec_vnode3_esc
                exec_host = self.jobA_exec_host3
                exec_host_esc = self.jobA_exec_host3_esc
                exec_vnode = self.jobA_exec_vnode3
                exec_vnode_esc = self.jobA_exec_vnode3_esc
                vnode_list = [self.nAv2, self.nBv1, self.nC,
                              self.nD, self.nE]
            self.server.expect(JOB, {'job_state': 'R',
                                     'substate': 41,
                                     'tolerate_node_failures': 'job_start',
                                     'Resource_List.mem': '3gb',
                                     'Resource_List.ncpus': 3,
                                     'Resource_List.nodect': 3,
                                     'exec_host': exec_host,
                                     'exec_vnode': exec_vnode,
                                     'Resource_List.select': self.jobA_select,
                                     'Resource_List.site': self.jobA_oselect,
                                     'Resource_List.place': self.jobA_place,
                                     'schedselect': self.jobA_schedselect},
                               id=sjid, attrop=PTL_AND)
            # Verify mom_logs
            sjid_esc = sjid.replace(
                "[", "\\[").replace("]", "\\]").replace(
                "(", "\\(").replace(")", "\\)").replace("+", "\\+")
            self.momA.log_match(
                "Job;%s;job_start_error.+from node %s.+could not JOIN_JOB" % (
                    sjid_esc, self.hostC), n=10, regexp=True)
            self.momA.log_match(
                "Job;%s;ignoring error from %s.+as job " % (
                    sjid_esc, self.hostC) + "is tolerant of node failures",
                regexp=True, n=10)
            for vn in vnode_list:
                self.momA.log_match("Job;%s;prolo: found vnode_list[%s]" % (
                    sjid, vn), n=10)
            vnode_list_fail = [self.nC]
            for vn in vnode_list_fail:
                self.momA.log_match(
                    "Job;%s;prolo: found vnode_list_fail[%s]" % (
                        sjid, vn), n=10)
            # Check result of pbs.event().job.release_nodes(keep_select)
            # call
            self.momA.log_match("Job;%s;prolo: job.exec_vnode=%s" % (
                sjid, exec_vnode), n=10)
            self.momA.log_match("Job;%s;prolo: job.schedselect=%s" % (
                sjid, self.jobA_schedselect), n=10)
            self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
                sjid, iexec_vnode), n=10)
            self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
                sjid, exec_vnode), n=10)
            # Check accounting_logs
            self.match_accounting_log('S', sjid_esc, iexec_host_esc,
                                      iexec_vnode_esc, "5gb", 5, 5,
                                      self.jobA_place,
                                      self.jobA_isel_esc)
            self.match_accounting_log('s', sjid_esc, exec_host_esc,
                                      exec_vnode_esc,
                                      "3gb", 3, 3,
                                      self.jobA_place,
                                      self.jobA_sel_esc)

    @timeout(400)
    def test_t21(self):
        """
        Test: radio silent moms causing the primary mom to not get
        any acks from the sister moms executing prologue hooks.
        After some 'job_launch_delay' time has passed, the primary
        mom will consider node hosts that have not acknowledged
        the prologue hook execution as failed hosts, and will
        not use their vnodes when pruning the job.
        """
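        # $job_launch_delay is raised to 120s here so there is a wide
        # window in which the SIGSTOPped sister moms (momE, momC) stay
        # silent; once the window expires, the primary mom treats those
        # hosts as failed and prunes the job without their vnodes.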
  4634. job_launch_delay = 120
  4635. c = {'$job_launch_delay': job_launch_delay}
  4636. self.momA.add_config(c)
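        # $job_launch_delay: how long the primary mom waits for prologue
        # hook acks from the sister moms before marking unresponsive hosts
        # as failed.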
        # instantiate queuejob hook
        hook_event = "queuejob"
        hook_name = "qjob"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
        # instantiate execjob_prologue hook
        hook_event = "execjob_prologue"
        hook_name = "prolo"
        a = {'event': hook_event, 'enabled': 'true', 'alarm': 60}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body5)
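        # A hook's 'alarm' attribute is its execution timeout in seconds;
        # 60s gives this prologue hook room for its 30-second sleep.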
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # First, turn off scheduling
        a = {'scheduling': 'false'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        jid = self.create_and_submit_job('job1')
        # Job gets queued and reflects the incremented values from the
        # queuejob hook
        self.server.expect(JOB, {'job_state': 'Q',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '10gb',
                                 'Resource_List.ncpus': 13,
                                 'Resource_List.nodect': 5,
                                 'Resource_List.select': self.job1_iselect,
                                 'Resource_List.site': self.job1_oselect,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_ischedselect},
                           id=jid, attrop=PTL_AND)
        a = {'scheduling': 'true'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.momE.log_match(
            "Job;%s;sleeping for 30 secs" % (jid,), n=10)
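        # The 30-second sleep comes from the execjob_prologue hook; seeing it
        # in the log confirms the hook is running before the mom is stopped.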
        # temporarily suspend momE, simulating a radio-silent mom
        self.momE.signal("-STOP")
        self.momC.log_match(
            "Job;%s;sleeping for 30 secs" % (jid,), n=10)
        # temporarily suspend momC, simulating a radio-silent mom
        self.momC.signal("-STOP")
        # sleep for as long as the primary mom waits for prologue hook
        # acknowledgements from all the sister moms
        self.logger.info("sleeping for %d secs waiting for healthy nodes" % (
            job_launch_delay,))
        time.sleep(job_launch_delay)
        # Job eventually launches, reflecting the values pruned back
        # toward the original select spec.
        # max_attempts=70 since it can take up to ~60 seconds for the
        # primary mom to wait for the sisters to join (default
        # $sister_join_job_alarm of 30 seconds) and to wait for the
        # sisters' execjob_prologue hooks (default $job_launch_delay
        # value of 30 seconds)
        self.server.expect(JOB, {'job_state': 'R',
                                 'tolerate_node_failures': 'job_start',
                                 'Resource_List.mem': '6gb',
                                 'Resource_List.ncpus': 8,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1v4_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1v4_schedselect,
                                 'exec_host': self.job1v4_exec_host,
                                 'exec_vnode': self.job1v4_exec_vnode},
                           id=jid, interval=1, attrop=PTL_AND, max_attempts=70)
        thisjob = self.server.status(JOB, id=jid)
        if thisjob:
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        # Check various vnode status.
        jobs_assn1 = "%s/0" % (jid,)
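        # Entries in a vnode's 'jobs' attribute have the form '<jobid>/<index>'.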
        self.match_vnode_status([self.nAv0, self.nAv1, self.nB, self.nBv0],
                                'job-busy', jobs_assn1, 1, '1048576kb')
        jobs_assn2 = "%s/0, %s/1" % (jid, jid)
        self.match_vnode_status([self.nD], 'free', jobs_assn2,
                                2, '2097152kb')
        self.match_vnode_status([self.nAv2, self.nBv1],
                                'job-busy', jobs_assn1, 1, '0kb')
        self.match_vnode_status([self.nA, self.nAv3, self.nBv2, self.nBv3,
                                 self.nC, self.nD, self.nEv1, self.nEv2,
                                 self.nEv3, self.nE, self.nEv0], 'free')
        # check server/queue counts
        self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
                                    'resources_assigned.mem': '6291456kb'},
                           attrop=PTL_AND)
        self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
                                   'resources_assigned.mem': '6291456kb'},
                           id='workq', attrop=PTL_AND)
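        # The $PBS_NODEFILE seen by the job should list only the hosts in
        # the pruned exec_host.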
        self.assertTrue(
            self.pbs_nodefile_match_exec_host(jid, self.job1v4_exec_host))
        # Check vnode_list[] parameter in execjob_launch hook
        vnode_list = [self.nAv0, self.nAv1, self.nAv2,
                      self.nB, self.nBv0, self.nBv1,
                      self.nC, self.nD, self.nE, self.nEv0]
        for vn in vnode_list:
            self.momA.log_match("Job;%s;launch: found vnode_list[%s]" % (
                jid, vn), n=10)
        # Check vnode_list_fail[] parameter in execjob_launch hook
        vnode_list_fail = [self.nC, self.nE, self.nEv0]
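        # nC, nE, and nEv0 belong to the two moms SIGSTOPped above.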
        for vn in vnode_list_fail:
            self.momA.log_match("Job;%s;launch: found vnode_list_fail[%s]" % (
                jid, vn), n=10)
        # Check result of the pbs.event().job.release_nodes(keep_select) call
        self.momA.log_match("Job;%s;launch: job.exec_vnode=%s" % (
            jid, self.job1v4_exec_vnode), n=10)
        self.momA.log_match("Job;%s;launch: job.schedselect=%s" % (
            jid, self.job1v4_schedselect), n=10)
        self.momA.log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, self.job1_iexec_vnode), n=10)
        self.momA.log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, self.job1v4_exec_vnode), n=10)
        # Check accounting_logs
        self.match_accounting_log('S', jid, self.job1_iexec_host_esc,
                                  self.job1_iexec_vnode_esc, "10gb", 13, 5,
                                  self.job1_place,
                                  self.job1_isel_esc)
        self.match_accounting_log('s', jid, self.job1v4_exec_host_esc,
                                  self.job1v4_exec_vnode_esc,
                                  "6gb", 8, 3,
                                  self.job1_place,
                                  self.job1v4_sel_esc)
        self.momA.log_match("Job;%s;task.+started, hostname" % (jid,),
                            n=10, max_attempts=60, interval=2, regexp=True)
        self.momA.log_match("Job;%s;copy file request received" % (jid,),
                            n=10, max_attempts=10, interval=2)
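        # The copy-file request indicates the job's output has been staged
        # back, so the output file can be read below.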
        # validate output
        expected_out = """/var/spool/pbs/aux/%s
%s
%s
%s
FIB TESTS
pbsdsh -n 1 fib 37
%d
pbsdsh -n 2 fib 37
%d
fib 37
%d
HOSTNAME TESTS
pbsdsh -n 0 hostname
%s
pbsdsh -n 1 hostname
%s
pbsdsh -n 2 hostname
%s
PBS_NODEFILE tests
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
HOST=%s
pbs_tmrsh %s hostname
%s
""" % (jid, self.momA.hostname, self.momB.hostname, self.momD.hostname,
       self.fib37_value, self.fib37_value, self.fib37_value,
       self.momA.shortname, self.momB.shortname, self.momD.shortname,
       self.momA.hostname, self.momA.hostname, self.momA.shortname,
       self.momB.hostname, self.momB.hostname, self.momB.shortname,
       self.momD.hostname, self.momD.hostname, self.momD.shortname)
        job_out = ""
        with open(job_output_file, 'r') as fd:
            job_out = fd.read()
        self.assertEqual(job_out, expected_out)