6 Star 3 Fork 5

Gitee 极速下载/repmgr

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
此仓库是为了提升国内下载速度的镜像仓库,每日同步一次。 原始仓库: https://github.com/2ndQuadrant/repmgr/
克隆/下载
repmgrd-physical.c 160.27 KB
一键复制 编辑 原始数据 按行查看 历史
Mario Gonzalez 提交于 2024-11-13 23:10 +08:00 . Fix shadowed declaration
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628
/*
* repmgrd-physical.c - physical (streaming) replication functionality for repmgrd
*
* Copyright (c) EnterpriseDB Corporation, 2010-2021
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <signal.h>
#include "repmgr.h"
#include "repmgrd.h"
#include "repmgrd-physical.h"
typedef enum
{
FAILOVER_STATE_UNKNOWN = -1,
FAILOVER_STATE_NONE,
FAILOVER_STATE_PROMOTED,
FAILOVER_STATE_PROMOTION_FAILED,
FAILOVER_STATE_PRIMARY_REAPPEARED,
FAILOVER_STATE_LOCAL_NODE_FAILURE,
FAILOVER_STATE_WAITING_NEW_PRIMARY,
FAILOVER_STATE_FOLLOW_NEW_PRIMARY,
FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER,
FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,
FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
FAILOVER_STATE_NO_NEW_PRIMARY,
FAILOVER_STATE_FOLLOW_FAIL,
FAILOVER_STATE_NODE_NOTIFICATION_ERROR,
FAILOVER_STATE_ELECTION_RERUN
} FailoverState;
typedef enum
{
ELECTION_NOT_CANDIDATE = -1,
ELECTION_WON,
ELECTION_LOST,
ELECTION_CANCELLED,
ELECTION_RERUN
} ElectionResult;
typedef struct election_stats
{
int visible_nodes;
int shared_upstream_nodes;
int all_nodes;
} election_stats;
typedef struct t_child_node_info
{
int node_id;
char node_name[NAMEDATALEN];
t_server_type type;
NodeAttached attached;
instr_time detached_time;
struct t_child_node_info *next;
} t_child_node_info;
typedef struct t_child_node_info_list
{
t_child_node_info *head;
t_child_node_info *tail;
int node_count;
} t_child_node_info_list;
#define T_CHILD_NODE_INFO_LIST_INITIALIZER { \
NULL, \
NULL, \
0 \
}
static PGconn *upstream_conn = NULL;
static PGconn *primary_conn = NULL;
static FailoverState failover_state = FAILOVER_STATE_UNKNOWN;
static int primary_node_id = UNKNOWN_NODE_ID;
static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
static instr_time last_monitoring_update;
static bool child_nodes_disconnect_command_executed = false;
static ElectionResult do_election(NodeInfoList *sibling_nodes, int *new_primary_id);
static const char *_print_election_result(ElectionResult result);
static FailoverState promote_self(void);
static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
static void check_connection(t_node_info *node_info, PGconn **conn);
static bool check_primary_status(int degraded_monitoring_elapsed);
static void check_primary_child_nodes(t_child_node_info_list *local_child_nodes);
static bool wait_primary_notification(int *new_primary_id);
static FailoverState follow_new_primary(int new_primary_id);
static FailoverState witness_follow_new_primary(int new_primary_id);
static void reset_node_voting_status(void);
static bool do_primary_failover(void);
static bool do_upstream_standby_failover(void);
static bool do_witness_failover(void);
static bool update_monitoring_history(void);
static void handle_sighup(PGconn **conn, t_server_type server_type);
static const char *format_failover_state(FailoverState failover_state);
static ElectionResult execute_failover_validation_command(t_node_info *node_info, election_stats *stats);
static void parse_failover_validation_command(const char *template, t_node_info *node_info, election_stats *stats, PQExpBufferData *out);
static bool check_node_can_follow(PGconn *local_conn, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_info);
static void check_witness_attached(t_node_info *node_info, bool startup);
static t_child_node_info *append_child_node_record(t_child_node_info_list *nodes, int node_id, const char *node_name, t_server_type type, NodeAttached attached);
static void remove_child_node_record(t_child_node_info_list *nodes, int node_id);
static void clear_child_node_info_list(t_child_node_info_list *nodes);
static void parse_child_nodes_disconnect_command(char *parsed_command, char *template, int reporting_node_id);
static void execute_child_nodes_disconnect_command(NodeInfoList *db_child_node_records, t_child_node_info_list *local_child_nodes);
static int try_primary_reconnect(PGconn **conn, PGconn *local_conn, t_node_info *node_info);
void
handle_sigint_physical(SIGNAL_ARGS)
{
PGconn *writeable_conn;
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("%s signal received"),
postgres_signal_arg == SIGTERM
? "TERM" : "INT");
log_notice("%s", event_details.data);
if (local_node_info.type == PRIMARY)
writeable_conn = local_conn;
else
writeable_conn = primary_conn;
if (PQstatus(writeable_conn) == CONNECTION_OK)
create_event_notification(writeable_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_shutdown",
true,
event_details.data);
termPQExpBuffer(&event_details);
terminate(SUCCESS);
}
/* perform some sanity checks on the node's configuration */
void
do_physical_node_check(PGconn *conn)
{
/*
* If node record is "inactive"; if not, attempt to set it to "active".
*
* Usually it will have become inactive due to e.g. a standby being shut down
* while repmgrd was running in an unpaused state. In this case it's
* perfectly reasonable to automatically mark the node as "active".
*/
if (local_node_info.active == false)
{
char *hint = "Check that \"repmgr (primary|standby) register\" was executed for this node";
RecoveryType recovery_type = get_recovery_type(conn);
/*
* If the local node's recovery status is incompatible with its registered
* status, e.g. registered as primary but running as a standby, refuse to start.
*
* This typically happens when a failed primary is recloned but the node was not
* re-registered, leaving the cluster in a potentially ambiguous state. In
* this case it would not be possible or desirable to attempt to set the
* node to active; the user should ensure the cluster is in the correct state.
*/
if (recovery_type != RECTYPE_UNKNOWN && local_node_info.type != UNKNOWN)
{
bool require_reregister = false;
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
if (recovery_type == RECTYPE_STANDBY && local_node_info.type != STANDBY)
{
appendPQExpBuffer(&event_details,
_("node is registered as a %s but running as a standby"),
get_node_type_string(local_node_info.type));
require_reregister = true;
}
else if (recovery_type == RECTYPE_PRIMARY && local_node_info.type == STANDBY)
{
log_error(_("node is registered as a standby but running as a %s"), get_node_type_string(local_node_info.type));
require_reregister = true;
}
if (require_reregister == true)
{
log_error("%s", event_details.data);
log_hint(_("%s"), hint);
create_event_notification(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_start",
false,
event_details.data);
termPQExpBuffer(&event_details);
terminate(ERR_BAD_CONFIG);
}
termPQExpBuffer(&event_details);
}
/*
* Attempt to set node record active (unless explicitly configured not to)
*/
log_notice(_("setting node record for node \"%s\" (ID: %i) to \"active\""),
local_node_info.node_name,
local_node_info.node_id);
if (config_file_options.repmgrd_exit_on_inactive_node == false)
{
PGconn *primary_conn = get_primary_connection(conn, NULL, NULL);
bool success = true;
if (PQstatus(primary_conn) != CONNECTION_OK)
{
log_error(_("unable to connect to the primary node to activate the node record"));
success = false;
}
else
{
success = update_node_record_set_active(primary_conn, local_node_info.node_id, true);
PQfinish(primary_conn);
}
if (success == true)
{
local_node_info.active = true;
}
}
/*
* Corner-case where it was not possible to set the node to "active"
*/
if (local_node_info.active == false)
{
switch (config_file_options.failover)
{
/* "failover" is an enum, all values should be covered here */
case FAILOVER_AUTOMATIC:
log_error(_("this node is marked as inactive and cannot be used as a failover target"));
log_hint(_("%s"), hint);
create_event_notification(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_start",
false,
"node is inactive and cannot be used as a failover target");
terminate(ERR_BAD_CONFIG);
break;
case FAILOVER_MANUAL:
log_warning(_("this node is marked as inactive and will be passively monitored only"));
log_hint(_("%s"), hint);
break;
}
}
}
if (config_file_options.failover == FAILOVER_AUTOMATIC)
{
/*
* Check that "promote_command" and "follow_command" are defined, otherwise repmgrd
* won't be able to perform any useful action in a failover situation.
*/
bool required_param_missing = false;
if (config_file_options.promote_command[0] == '\0')
{
log_error(_("\"promote_command\" must be defined in the configuration file"));
if (config_file_options.service_promote_command[0] != '\0')
{
/*
* "service_promote_command" is *not* a substitute for "promote_command";
* it is intended for use in those systems (e.g. Debian) where there's a service
* level promote command (e.g. pg_ctlcluster).
*
* "promote_command" should either execute "repmgr standby promote" directly, or
* a script which executes "repmgr standby promote". This is essential, as the
* repmgr metadata is updated by "repmgr standby promote".
*
* "service_promote_command", if set, will be executed by "repmgr standby promote",
* but never by repmgrd.
*
*/
log_hint(_("\"service_promote_command\" is set, but can only be executed by \"repmgr standby promote\""));
}
required_param_missing = true;
}
if (config_file_options.follow_command[0] == '\0')
{
log_error(_("\"follow_command\" must be defined in the configuration file"));
required_param_missing = true;
}
if (required_param_missing == true)
{
log_hint(_("add the missing configuration parameter(s) and start repmgrd again"));
terminate(ERR_BAD_CONFIG);
}
}
}
/*
* repmgrd running on the primary server
*/
void
monitor_streaming_primary(void)
{
instr_time log_status_interval_start;
instr_time child_nodes_check_interval_start;
t_child_node_info_list local_child_nodes = T_CHILD_NODE_INFO_LIST_INITIALIZER;
reset_node_voting_status();
repmgrd_set_upstream_node_id(local_conn, NO_UPSTREAM_NODE);
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("monitoring cluster primary \"%s\" (ID: %i)"),
local_node_info.node_name,
local_node_info.node_id);
/* Log startup event */
if (startup_event_logged == false)
{
create_event_notification(local_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_start",
true,
event_details.data);
startup_event_logged = true;
}
else
{
create_event_notification(local_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_reload",
true,
event_details.data);
}
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
}
INSTR_TIME_SET_CURRENT(log_status_interval_start);
INSTR_TIME_SET_CURRENT(child_nodes_check_interval_start);
local_node_info.node_status = NODE_STATUS_UP;
/*
* get list of expected and attached nodes
*/
{
NodeInfoList db_child_node_records = T_NODE_INFO_LIST_INITIALIZER;
bool success = get_child_nodes(local_conn, config_file_options.node_id, &db_child_node_records);
if (!success)
{
log_error(_("unable to retrieve list of child nodes"));
}
else
{
NodeInfoListCell *cell;
for (cell = db_child_node_records.head; cell; cell = cell->next)
{
/*
* At startup, if a node for which a repmgr record exists, is not found
* in pg_stat_replication, we can't know whether it has become detached, or
* (e.g. during a provisioning operation) is a new node which has not yet
* attached. We set the status to "NODE_ATTACHED_UNKNOWN" to stop repmgrd
* emitting bogus "node has become detached" alerts.
*/
(void) append_child_node_record(&local_child_nodes,
cell->node_info->node_id,
cell->node_info->node_name,
cell->node_info->type,
cell->node_info->attached == NODE_ATTACHED ? NODE_ATTACHED : NODE_ATTACHED_UNKNOWN);
/*
* witness will not be "attached" in the normal way
*/
if (cell->node_info->type == WITNESS)
{
check_witness_attached(cell->node_info, true);
}
if (cell->node_info->attached == NODE_ATTACHED)
{
log_info(_("child node \"%s\" (ID: %i) is attached"),
cell->node_info->node_name,
cell->node_info->node_id);
}
else
{
log_info(_("child node \"%s\" (ID: %i) is not yet attached"),
cell->node_info->node_name,
cell->node_info->node_id);
}
}
}
}
while (true)
{
/*
* TODO: cache node list here, refresh at `node_list_refresh_interval`
* also return reason for inavailability so we can log it
*/
(void) connection_ping(local_conn);
check_connection(&local_node_info, &local_conn);
if (PQstatus(local_conn) != CONNECTION_OK)
{
/* local node is down, we were expecting it to be up */
if (local_node_info.node_status == NODE_STATUS_UP)
{
instr_time local_node_unreachable_start;
INSTR_TIME_SET_CURRENT(local_node_unreachable_start);
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBufferStr(&event_details,
_("unable to connect to local node"));
log_warning("%s", event_details.data);
/*
* as we're monitoring the primary, no point in trying to
* write the event to the database
*
* TODO: possibly add pre-action event here
*/
create_event_notification(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_local_disconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
local_node_info.node_status = NODE_STATUS_UNKNOWN;
try_reconnect(&local_conn, &local_node_info);
if (local_node_info.node_status == NODE_STATUS_UP)
{
int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);
int stored_local_node_id = UNKNOWN_NODE_ID;
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("reconnected to local node after %i seconds"),
local_node_unreachable_elapsed);
log_notice("%s", event_details.data);
create_event_notification(local_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_local_reconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
/*
* If the local node was restarted, we'll need to reinitialise values
* stored in shared memory.
*/
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
if (stored_local_node_id == UNKNOWN_NODE_ID)
{
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
repmgrd_set_pid(local_conn, getpid(), pid_file);
}
/*
* check that the local node is still primary, otherwise switch
* to standby monitoring
*/
if (check_primary_status(NO_DEGRADED_MONITORING_ELAPSED) == false)
return;
goto loop;
}
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
log_notice(_("unable to connect to local node, falling back to degraded monitoring"));
}
}
if (monitoring_state == MS_DEGRADED)
{
int degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
if (config_file_options.degraded_monitoring_timeout > 0
&& degraded_monitoring_elapsed > config_file_options.degraded_monitoring_timeout)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("degraded monitoring timeout (%i seconds) exceeded, terminating"),
degraded_monitoring_elapsed);
log_notice("%s", event_details.data);
create_event_notification(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_shutdown",
true,
event_details.data);
termPQExpBuffer(&event_details);
terminate(ERR_MONITORING_TIMEOUT);
}
log_debug("monitoring node in degraded state for %i seconds", degraded_monitoring_elapsed);
if (is_server_available(local_node_info.conninfo) == true)
{
close_connection(&local_conn);
local_conn = establish_db_connection(local_node_info.conninfo, false);
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_warning(_("node appears to be up but no connection could be made"));
close_connection(&local_conn);
}
else
{
local_node_info.node_status = NODE_STATUS_UP;
if (check_primary_status(degraded_monitoring_elapsed) == false)
return;
goto loop;
}
}
/*
* possibly attempt to find another node from cached list check if
* there's a new primary - if so add hook for fencing? loop, if
* starts up check status, switch monitoring mode
*/
}
else
{
if (config_file_options.child_nodes_check_interval > 0)
{
int child_nodes_check_interval_elapsed = calculate_elapsed(child_nodes_check_interval_start);
if (child_nodes_check_interval_elapsed >= config_file_options.child_nodes_check_interval)
{
INSTR_TIME_SET_CURRENT(child_nodes_check_interval_start);
check_primary_child_nodes(&local_child_nodes);
}
}
}
loop:
/* check node is still primary, if not restart monitoring */
if (check_primary_status(NO_DEGRADED_MONITORING_ELAPSED) == false)
return;
/* emit "still alive" log message at regular intervals, if requested */
if (config_file_options.log_status_interval > 0)
{
int log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
if (log_status_interval_elapsed >= config_file_options.log_status_interval)
{
log_info(_("monitoring primary node \"%s\" (ID: %i) in %s state"),
local_node_info.node_name,
local_node_info.node_id,
print_monitoring_state(monitoring_state));
if (monitoring_state == MS_DEGRADED)
{
log_detail(_("waiting for the node to become available"));
}
INSTR_TIME_SET_CURRENT(log_status_interval_start);
}
}
if (got_SIGHUP)
{
handle_sighup(&local_conn, PRIMARY);
}
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
config_file_options.monitor_interval_secs);
sleep(config_file_options.monitor_interval_secs);
}
}
/*
* If monitoring a primary, it's possible that after an outage of the local node
* (due to e.g. a switchover), the node has come back as a standby. We therefore
* need to verify its status and if everything looks OK, restart monitoring in
* standby mode.
*
* Returns "true" to indicate repmgrd should continue monitoring the node as
* a primary; "false" indicates repmgrd should start monitoring the node as
* a standby.
*/
bool
check_primary_status(int degraded_monitoring_elapsed)
{
PGconn *new_primary_conn;
RecordStatus record_status;
bool resume_monitoring = true;
RecoveryType recovery_type = get_recovery_type(local_conn);
if (recovery_type == RECTYPE_UNKNOWN)
{
log_warning(_("unable to determine node recovery status"));
/* "true" to indicate repmgrd should continue monitoring in degraded state */
return true;
}
/* node is still primary - resume monitoring */
if (recovery_type == RECTYPE_PRIMARY)
{
if (degraded_monitoring_elapsed != NO_DEGRADED_MONITORING_ELAPSED)
{
PQExpBufferData event_details;
monitoring_state = MS_NORMAL;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("reconnected to primary node after %i seconds, resuming monitoring"),
degraded_monitoring_elapsed);
create_event_notification(local_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_local_reconnect",
true,
event_details.data);
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
}
return true;
}
/* the node is now a standby */
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
if (degraded_monitoring_elapsed != NO_DEGRADED_MONITORING_ELAPSED)
{
appendPQExpBuffer(&event_details,
_("reconnected to node after %i seconds, node is now a standby, switching to standby monitoring"),
degraded_monitoring_elapsed);
}
else
{
appendPQExpBufferStr(&event_details,
_("node is now a standby, switching to standby monitoring"));
}
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
}
primary_node_id = UNKNOWN_NODE_ID;
new_primary_conn = get_primary_connection_quiet(local_conn, &primary_node_id, NULL);
if (PQstatus(new_primary_conn) != CONNECTION_OK)
{
if (primary_node_id == UNKNOWN_NODE_ID)
{
log_warning(_("unable to determine a new primary node"));
}
else
{
log_warning(_("unable to connect to new primary node %i"), primary_node_id);
log_detail("\n%s", PQerrorMessage(new_primary_conn));
}
close_connection(&new_primary_conn);
/* "true" to indicate repmgrd should continue monitoring in degraded state */
return true;
}
log_debug("primary node ID is now %i", primary_node_id);
record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
/*
* If, for whatever reason, the new primary has no record of this node,
* we won't be able to perform proper monitoring. In that case
* terminate and let the user sort out the situation.
*/
if (record_status == RECORD_NOT_FOUND)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("no metadata record found for this node on current primary %i"),
primary_node_id);
log_error("%s", event_details.data);
log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
close_connection(&new_primary_conn);
create_event_notification(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_shutdown",
false,
event_details.data);
termPQExpBuffer(&event_details);
terminate(ERR_BAD_CONFIG);
}
log_debug("node %i is registered with type = %s",
config_file_options.node_id,
get_node_type_string(local_node_info.type));
/*
* node has recovered but metadata not updated - we can do that ourselves,
*/
if (local_node_info.type == PRIMARY)
{
log_notice(_("node \"%s\" (ID: %i) still registered as primary, setting to standby"),
config_file_options.node_name,
config_file_options.node_id);
if (update_node_record_set_active_standby(new_primary_conn, config_file_options.node_id) == false)
{
resume_monitoring = false;
}
else
{
/* refresh our copy of the node record from the primary */
record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
/* this is unlikely to happen */
if (record_status != RECORD_FOUND)
{
log_warning(_("unable to retrieve local node record from primary node %i"), primary_node_id);
resume_monitoring = false;
}
}
}
if (resume_monitoring == true)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
if (degraded_monitoring_elapsed != NO_DEGRADED_MONITORING_ELAPSED)
{
monitoring_state = MS_NORMAL;
log_notice(_("former primary has been restored as standby after %i seconds, updating node record and resuming monitoring"),
degraded_monitoring_elapsed);
appendPQExpBuffer(&event_details,
_("node restored as standby after %i seconds, monitoring connection to upstream node %i"),
degraded_monitoring_elapsed,
local_node_info.upstream_node_id);
}
else
{
if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
{
/*
* If upstream_node_id is not set, it's possible that following a switchover
* of some kind (possibly forced in some way), the updated node record has
* not yet propagated to the local node. In this case however we can safely
* assume we're monitoring the primary.
*/
appendPQExpBuffer(&event_details,
_("node has become a standby, monitoring connection to primary node %i"),
primary_node_id);
}
else
{
appendPQExpBuffer(&event_details,
_("node has become a standby, monitoring connection to upstream node %i"),
local_node_info.upstream_node_id);
}
}
create_event_notification(new_primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_standby_reconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
close_connection(&new_primary_conn);
/* restart monitoring as standby */
return false;
}
/* continue monitoring as before */
return true;
}
static void
check_primary_child_nodes(t_child_node_info_list *local_child_nodes)
{
NodeInfoList db_child_node_records = T_NODE_INFO_LIST_INITIALIZER;
NodeInfoListCell *cell;
/* lists for newly attached and missing nodes */
t_child_node_info_list disconnected_child_nodes = T_CHILD_NODE_INFO_LIST_INITIALIZER;
t_child_node_info_list reconnected_child_nodes = T_CHILD_NODE_INFO_LIST_INITIALIZER;
t_child_node_info_list new_child_nodes = T_CHILD_NODE_INFO_LIST_INITIALIZER;
bool success = get_child_nodes(local_conn, config_file_options.node_id, &db_child_node_records);
if (!success)
{
/* unlikely this will happen, but if it does, we'll try again next time round */
log_error(_("unable to retrieve list of child nodes"));
return;
}
if (db_child_node_records.node_count == 0)
{
/* no registered child nodes - nothing to do */
return;
}
/*
* compare DB records with our internal list;
* this will tell us about:
* - previously known nodes and their current status
* - newly registered nodes we didn't know about
*
* We'll need to compare the opposite way to check for nodes
* which are in the internal list, but which have now vanished
*/
for (cell = db_child_node_records.head; cell; cell = cell->next)
{
t_child_node_info *local_child_node_rec;
bool local_child_node_rec_found = false;
/*
* witness will not be "attached" in the normal way
*/
if (cell->node_info->type == WITNESS)
{
check_witness_attached(cell->node_info, false);
}
log_debug("child node: %i; attached: %s",
cell->node_info->node_id,
cell->node_info->attached == NODE_ATTACHED ? "yes" : "no");
for (local_child_node_rec = local_child_nodes->head; local_child_node_rec; local_child_node_rec = local_child_node_rec->next)
{
if (local_child_node_rec->node_id == cell->node_info->node_id)
{
local_child_node_rec_found = true;
break;
}
}
if (local_child_node_rec_found == true)
{
/* our node record shows node attached, DB record indicates detached */
if (local_child_node_rec->attached == NODE_ATTACHED && cell->node_info->attached == NODE_DETACHED)
{
t_child_node_info *detached_child_node;
local_child_node_rec->attached = NODE_DETACHED;
INSTR_TIME_SET_CURRENT(local_child_node_rec->detached_time);
detached_child_node = append_child_node_record(&disconnected_child_nodes,
local_child_node_rec->node_id,
local_child_node_rec->node_name,
local_child_node_rec->type,
NODE_DETACHED);
detached_child_node->detached_time = local_child_node_rec->detached_time;
}
/* our node record shows node detached, DB record indicates attached */
else if (local_child_node_rec->attached == NODE_DETACHED && cell->node_info->attached == NODE_ATTACHED)
{
t_child_node_info *attached_child_node;
local_child_node_rec->attached = NODE_ATTACHED;
attached_child_node = append_child_node_record(&reconnected_child_nodes,
local_child_node_rec->node_id,
local_child_node_rec->node_name,
local_child_node_rec->type,
NODE_ATTACHED);
attached_child_node->detached_time = local_child_node_rec->detached_time;
INSTR_TIME_SET_ZERO(local_child_node_rec->detached_time);
}
else if (local_child_node_rec->attached == NODE_ATTACHED_UNKNOWN && cell->node_info->attached == NODE_ATTACHED)
{
local_child_node_rec->attached = NODE_ATTACHED;
append_child_node_record(&new_child_nodes,
local_child_node_rec->node_id,
local_child_node_rec->node_name,
local_child_node_rec->type,
NODE_ATTACHED);
}
}
else
{
/* node we didn't know about before */
NodeAttached attached = cell->node_info->attached;
/*
* node registered but not attached - set state to "UNKNOWN"
* to prevent a bogus "reattach" event being generated
*/
if (attached == NODE_DETACHED)
attached = NODE_ATTACHED_UNKNOWN;
(void) append_child_node_record(local_child_nodes,
cell->node_info->node_id,
cell->node_info->node_name,
cell->node_info->type,
attached);
(void) append_child_node_record(&new_child_nodes,
cell->node_info->node_id,
cell->node_info->node_name,
cell->node_info->type,
attached);
}
}
/*
* Check if any nodes in local list are no longer in list returned
* from database.
*/
{
t_child_node_info *local_child_node_rec;
bool db_node_rec_found = false;
for (local_child_node_rec = local_child_nodes->head; local_child_node_rec; local_child_node_rec = local_child_node_rec->next)
{
for (cell = db_child_node_records.head; cell; cell = cell->next)
{
if (cell->node_info->node_id == local_child_node_rec->node_id)
{
db_node_rec_found = true;
break;
}
}
if (db_node_rec_found == false)
{
log_notice(_("%s node \"%s\" (ID: %i) is no longer connected or registered"),
get_node_type_string(local_child_node_rec->type),
local_child_node_rec->node_name,
local_child_node_rec->node_id);
remove_child_node_record(local_child_nodes, local_child_node_rec->node_id);
}
}
}
/* generate "child_node_disconnect" events */
if (disconnected_child_nodes.node_count > 0)
{
t_child_node_info *child_node_rec;
for (child_node_rec = disconnected_child_nodes.head; child_node_rec; child_node_rec = child_node_rec->next)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("%s node \"%s\" (ID: %i) has disconnected"),
get_node_type_string(child_node_rec->type),
child_node_rec->node_name,
child_node_rec->node_id);
log_notice("%s", event_details.data);
create_event_notification(local_conn,
&config_file_options,
local_node_info.node_id,
"child_node_disconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
}
/* generate "child_node_reconnect" events */
if (reconnected_child_nodes.node_count > 0)
{
t_child_node_info *child_node_rec;
for (child_node_rec = reconnected_child_nodes.head; child_node_rec; child_node_rec = child_node_rec->next)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("%s node \"%s\" (ID: %i) has reconnected after %i seconds"),
get_node_type_string(child_node_rec->type),
child_node_rec->node_name,
child_node_rec->node_id,
calculate_elapsed( child_node_rec->detached_time ));
log_notice("%s", event_details.data);
create_event_notification(local_conn,
&config_file_options,
local_node_info.node_id,
"child_node_reconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
}
/* generate "child_node_new_connect" events */
if (new_child_nodes.node_count > 0)
{
t_child_node_info *child_node_rec;
for (child_node_rec = new_child_nodes.head; child_node_rec; child_node_rec = child_node_rec->next)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("new %s \"%s\" (ID: %i) has connected"),
get_node_type_string(child_node_rec->type),
child_node_rec->node_name,
child_node_rec->node_id);
log_notice("%s", event_details.data);
create_event_notification(local_conn,
&config_file_options,
local_node_info.node_id,
"child_node_new_connect",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
}
if (config_file_options.child_nodes_disconnect_command[0] != '\0')
{
bool repmgrd_paused = repmgrd_is_paused(local_conn);
if (repmgrd_paused == false)
{
/* check criteria for execution, and execute if criteria met */
execute_child_nodes_disconnect_command(&db_child_node_records, local_child_nodes);
}
}
clear_child_node_info_list(&disconnected_child_nodes);
clear_child_node_info_list(&reconnected_child_nodes);
clear_child_node_info_list(&new_child_nodes);
clear_node_info_list(&db_child_node_records);
}
void
execute_child_nodes_disconnect_command(NodeInfoList *db_child_node_records, t_child_node_info_list *local_child_nodes)
{
/*
* script will only be executed if the number of attached
* standbys is lower than this number
*/
int min_required_connected_count = 1;
int connected_count = 0;
NodeInfoListCell *cell;
/*
* Calculate minimum number of nodes which need to be connected
* (if the total falls below that, "child_nodes_disconnect_command"
* will be executed)
*/
if (config_file_options.child_nodes_connected_min_count > 0)
{
min_required_connected_count = config_file_options.child_nodes_connected_min_count;
}
else if (config_file_options.child_nodes_disconnect_min_count > 0)
{
int child_node_count = db_child_node_records->node_count;
if (config_file_options.child_nodes_connected_include_witness == false)
{
/* reduce total, if witness server in child node list */
for (cell = db_child_node_records->head; cell; cell = cell->next)
{
if (cell->node_info->type == WITNESS)
{
child_node_count--;
break;
}
}
}
min_required_connected_count =
(child_node_count - config_file_options.child_nodes_disconnect_min_count)
+ 1;
}
/* calculate number of connected child nodes */
for (cell = db_child_node_records->head; cell; cell = cell->next)
{
/* exclude witness server from total, if necessary */
if (config_file_options.child_nodes_connected_include_witness == false &&
cell->node_info->type == WITNESS)
continue;
if (cell->node_info->attached == NODE_ATTACHED)
connected_count ++;
}
log_debug("connected: %i; min required: %i",
connected_count,
min_required_connected_count);
if (connected_count < min_required_connected_count)
{
log_notice(_("%i (of %i) child nodes are connected, but at least %i child nodes required"),
connected_count,
db_child_node_records->node_count,
min_required_connected_count);
if (child_nodes_disconnect_command_executed == false)
{
t_child_node_info *child_node_rec;
/* set these for informative purposes */
int most_recently_disconnected_node_id = UNKNOWN_NODE_ID;
int most_recently_disconnected_elapsed = -1;
bool most_recent_disconnect_below_threshold = false;
instr_time current_time_base;
INSTR_TIME_SET_CURRENT(current_time_base);
for (child_node_rec = local_child_nodes->head; child_node_rec; child_node_rec = child_node_rec->next)
{
instr_time current_time = current_time_base;
int seconds_since_detached;
/* exclude witness server from calculation, if requested */
if (config_file_options.child_nodes_connected_include_witness == false &&
child_node_rec->type == WITNESS)
continue;
if (child_node_rec->attached != NODE_DETACHED)
continue;
INSTR_TIME_SUBTRACT(current_time, child_node_rec->detached_time);
seconds_since_detached = (int) INSTR_TIME_GET_DOUBLE(current_time);
if (seconds_since_detached < config_file_options.child_nodes_disconnect_timeout)
{
most_recent_disconnect_below_threshold = true;
}
if (most_recently_disconnected_node_id == UNKNOWN_NODE_ID)
{
most_recently_disconnected_node_id = child_node_rec->node_id;
most_recently_disconnected_elapsed = seconds_since_detached;
}
else if (seconds_since_detached < most_recently_disconnected_elapsed)
{
most_recently_disconnected_node_id = child_node_rec->node_id;
most_recently_disconnected_elapsed = seconds_since_detached;
}
}
if (most_recent_disconnect_below_threshold == false && most_recently_disconnected_node_id != UNKNOWN_NODE_ID)
{
char parsed_child_nodes_disconnect_command[MAXPGPATH];
int child_nodes_disconnect_command_result;
PQExpBufferData event_details;
bool success = true;
parse_child_nodes_disconnect_command(parsed_child_nodes_disconnect_command,
config_file_options.child_nodes_disconnect_command,
local_node_info.node_id);
log_info(_("most recently detached child node was %i (ca. %i seconds ago), triggering \"child_nodes_disconnect_command\""),
most_recently_disconnected_node_id,
most_recently_disconnected_elapsed);
log_info(_("\"child_nodes_disconnect_command\" is:\n \"%s\""),
parsed_child_nodes_disconnect_command);
child_nodes_disconnect_command_result = system(parsed_child_nodes_disconnect_command);
initPQExpBuffer(&event_details);
if (child_nodes_disconnect_command_result != 0)
{
success = false;
appendPQExpBufferStr(&event_details,
_("unable to execute \"child_nodes_disconnect_command\""));
log_error("%s", event_details.data);
}
else
{
appendPQExpBufferStr(&event_details,
_("\"child_nodes_disconnect_command\" successfully executed"));
log_info("%s", event_details.data);
}
create_event_notification(local_conn,
&config_file_options,
local_node_info.node_id,
"child_nodes_disconnect_command",
success,
event_details.data);
termPQExpBuffer(&event_details);
child_nodes_disconnect_command_executed = true;
}
else if (most_recently_disconnected_node_id != UNKNOWN_NODE_ID)
{
log_info(_("most recently detached child node was %i (ca. %i seconds ago), not triggering \"child_nodes_disconnect_command\""),
most_recently_disconnected_node_id,
most_recently_disconnected_elapsed);
log_detail(_("\"child_nodes_disconnect_timeout\" set to %i seconds"),
config_file_options.child_nodes_disconnect_timeout);
}
else
{
log_info(_("no child nodes have detached since repmgrd startup"));
}
}
else
{
log_info(_("\"child_nodes_disconnect_command\" was previously executed, taking no action"));
}
}
else
{
/*
* "child_nodes_disconnect_command" was executed, but for whatever reason
* enough child nodes have returned to clear the threshold; in that case reset
* the executed flag so we can execute the command again, if necessary
*/
if (child_nodes_disconnect_command_executed == true)
{
log_notice(_("%i (of %i) child nodes are now connected, meeting minimum requirement of %i child nodes"),
connected_count,
db_child_node_records->node_count,
min_required_connected_count);
child_nodes_disconnect_command_executed = false;
}
}
}
/*
* repmgrd running on a standby server
*/
void
monitor_streaming_standby(void)
{
RecordStatus record_status;
instr_time log_status_interval_start;
MonitoringState local_monitoring_state = MS_NORMAL;
instr_time local_degraded_monitoring_start;
bool upstream_check_result;
int degraded_monitoring_elapsed;
int last_known_upstream_node_id = UNKNOWN_NODE_ID;
log_debug("monitor_streaming_standby()");
reset_node_voting_status();
INSTR_TIME_SET_ZERO(last_monitoring_update);
/*
* If no upstream node id is specified in the metadata, we'll try and
* determine the current cluster primary in the assumption we should
* connect to that by default.
*/
if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
{
upstream_conn = get_primary_connection(local_conn, &local_node_info.upstream_node_id, NULL);
/*
* Terminate if there doesn't appear to be an active cluster primary.
* There could be one or more nodes marked as inactive primaries, and
* one of them could actually be a primary, but we can't sensibly
* monitor in that state.
*/
if (local_node_info.upstream_node_id == NODE_NOT_FOUND)
{
log_error(_("unable to determine an active primary for this cluster, terminating"));
terminate(ERR_BAD_CONFIG);
}
log_debug("upstream node ID determined as %i", local_node_info.upstream_node_id);
(void) get_node_record(upstream_conn, local_node_info.upstream_node_id, &upstream_node_info);
}
else
{
log_debug("upstream node ID in local node record is %i", local_node_info.upstream_node_id);
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &upstream_node_info);
/*
* Terminate if we can't find the record for the node we're supposed to
* monitor. This is a "fix-the-config" situation, not a lot else we can
* do.
*/
if (record_status == RECORD_NOT_FOUND)
{
log_error(_("no record found for upstream node (ID: %i), terminating"),
local_node_info.upstream_node_id);
log_hint(_("ensure the upstream node is registered correctly"));
terminate(ERR_DB_CONN);
}
else if (record_status == RECORD_ERROR)
{
log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
local_node_info.upstream_node_id);
terminate(ERR_DB_CONN);
}
log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
}
/*
* Upstream node must be running at repmgrd startup.
*
* We could possibly have repmgrd skip to degraded monitoring mode until
* it comes up, but there doesn't seem to be much point in doing that.
*/
if (PQstatus(upstream_conn) != CONNECTION_OK)
{
close_connection(&upstream_conn);
log_error(_("unable connect to upstream node (ID: %i), terminating"),
local_node_info.upstream_node_id);
log_hint(_("upstream node must be running before repmgrd can start"));
terminate(ERR_DB_CONN);
}
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
if (upstream_node_info.node_id == local_node_info.node_id)
{
close_connection(&upstream_conn);
return;
}
last_known_upstream_node_id = local_node_info.upstream_node_id;
/*
* refresh upstream node record from upstream node, so it's as up-to-date
* as possible
*/
record_status = get_node_record(upstream_conn, upstream_node_info.node_id, &upstream_node_info);
if (upstream_node_info.type == STANDBY)
{
log_debug("upstream node is standby, connecting to primary");
/*
* Currently cascaded standbys need to be able to connect to the
* primary. We could possibly add a limited connection mode for cases
* where this isn't possible, but that will complicate things further.
*/
primary_conn = establish_primary_db_connection(upstream_conn, false);
if (PQstatus(primary_conn) != CONNECTION_OK)
{
close_connection(&primary_conn);
log_error(_("unable to connect to primary node"));
log_hint(_("ensure the primary node is reachable from this node"));
terminate(ERR_DB_CONN);
}
log_verbose(LOG_DEBUG, "connected to primary");
}
else
{
log_debug("upstream node is primary");
primary_conn = upstream_conn;
}
/*
* It's possible monitoring has been restarted after some outage which
* resulted in the local node being marked as inactive; if so mark it
* as active again.
*/
if (local_node_info.active == false)
{
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
local_node_info.active = true;
}
}
if (PQstatus(primary_conn) == CONNECTION_OK)
{
primary_node_id = get_primary_node_id(primary_conn);
log_debug("primary_node_id is %i", primary_node_id);
}
else
{
primary_node_id = get_primary_node_id(local_conn);
log_debug("primary_node_id according to local records is %i", primary_node_id);
}
/* Log startup event */
if (startup_event_logged == false)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("monitoring connection to upstream node \"%s\" (ID: %i)"),
upstream_node_info.node_name,
upstream_node_info.node_id);
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_start",
true,
event_details.data);
startup_event_logged = true;
log_info("%s", event_details.data);
termPQExpBuffer(&event_details);
}
monitoring_state = MS_NORMAL;
INSTR_TIME_SET_CURRENT(log_status_interval_start);
upstream_node_info.node_status = NODE_STATUS_UP;
while (true)
{
log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo);
if (upstream_node_info.type == PRIMARY)
{
upstream_check_result = check_upstream_connection(&upstream_conn, upstream_node_info.conninfo, &primary_conn);
}
else
{
upstream_check_result = check_upstream_connection(&upstream_conn, upstream_node_info.conninfo, NULL);
}
if (upstream_check_result == true)
{
set_upstream_last_seen(local_conn, upstream_node_info.node_id);
}
else
{
/* upstream node is down, we were expecting it to be up */
if (upstream_node_info.node_status == NODE_STATUS_UP)
{
instr_time upstream_node_unreachable_start;
INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);
upstream_node_info.node_status = NODE_STATUS_UNKNOWN;
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("unable to connect to upstream node \"%s\" (ID: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
/* TODO: possibly add pre-action event here */
if (upstream_node_info.type == STANDBY)
{
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_disconnect",
true,
event_details.data);
}
else
{
/* primary connection lost - script notification only */
create_event_notification(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_disconnect",
true,
event_details.data);
}
log_warning("%s", event_details.data);
termPQExpBuffer(&event_details);
}
/*
* if local node is unreachable, make a last-minute attempt to reconnect
* before continuing with the failover process
*/
if (PQstatus(local_conn) != CONNECTION_OK)
{
check_connection(&local_node_info, &local_conn);
}
if (upstream_node_info.type == PRIMARY)
{
primary_node_id = try_primary_reconnect(&upstream_conn, local_conn, &upstream_node_info);
/*
* We were notified by the the primary during our own reconnection
* retry phase, in which case we can leave the failover process early
* and connect to the new primary.
*/
if (primary_node_id == ELECTION_RERUN_NOTIFICATION)
{
if (do_primary_failover() == true)
{
primary_node_id = get_primary_node_id(local_conn);
return;
}
}
if (primary_node_id != UNKNOWN_NODE_ID && primary_node_id != ELECTION_RERUN_NOTIFICATION)
{
follow_new_primary(primary_node_id);
return;
}
}
else
{
try_reconnect(&upstream_conn, &upstream_node_info);
}
/* Upstream node has recovered - log and continue */
if (upstream_node_info.node_status == NODE_STATUS_UP)
{
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("reconnected to upstream node after %i seconds"),
upstream_node_unreachable_elapsed);
log_notice("%s", event_details.data);
if (upstream_node_info.type == PRIMARY)
{
primary_conn = upstream_conn;
if (get_recovery_type(primary_conn) == RECTYPE_STANDBY)
{
ExecStatusType ping_result;
/*
* we're returning at the end of this block and no longer require the
* event details buffer
*/
termPQExpBuffer(&event_details);
log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name, upstream_node_info.node_id);
close_connection(&upstream_conn);
local_node_info.upstream_node_id = UNKNOWN_NODE_ID;
/* check local connection */
ping_result = connection_ping(local_conn);
if (ping_result != PGRES_TUPLES_OK)
{
int i;
close_connection(&local_conn);
for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
{
local_conn = establish_db_connection(local_node_info.conninfo, false);
if (PQstatus(local_conn) == CONNECTION_OK)
break;
close_connection(&local_conn);
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
i + 1,
config_file_options.repmgrd_standby_startup_timeout);
sleep(1);
}
}
return;
}
}
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_reconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
goto loop;
}
/* upstream is still down after reconnect attempt(s) */
if (upstream_node_info.node_status == NODE_STATUS_DOWN)
{
bool failover_done = false;
if (PQstatus(local_conn) == CONNECTION_OK && repmgrd_is_paused(local_conn))
{
log_notice(_("repmgrd on this node is paused"));
log_detail(_("no failover will be carried out"));
log_hint(_("execute \"repmgr service unpause\" to resume normal failover mode"));
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
}
else
{
if (upstream_node_info.type == PRIMARY)
{
failover_done = do_primary_failover();
}
else if (upstream_node_info.type == STANDBY)
{
failover_done = do_upstream_standby_failover();
if (failover_done == false)
{
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
}
}
/*
* XXX it's possible it will make sense to return in all
* cases to restart monitoring
*/
if (failover_done == true)
{
primary_node_id = get_primary_node_id(local_conn);
return;
}
}
}
}
}
if (monitoring_state == MS_DEGRADED)
{
degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
if (config_file_options.degraded_monitoring_timeout > 0
&& degraded_monitoring_elapsed > config_file_options.degraded_monitoring_timeout)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("degraded monitoring timeout (%i seconds) exceeded, terminating"),
degraded_monitoring_elapsed);
log_notice("%s", event_details.data);
create_event_notification(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_shutdown",
true,
event_details.data);
termPQExpBuffer(&event_details);
terminate(ERR_MONITORING_TIMEOUT);
}
log_debug("monitoring upstream node %i in degraded state for %i seconds",
upstream_node_info.node_id,
degraded_monitoring_elapsed);
if (upstream_node_info.type == PRIMARY)
{
upstream_check_result = check_upstream_connection(&upstream_conn, upstream_node_info.conninfo, &primary_conn);
}
else
{
upstream_check_result = check_upstream_connection(&upstream_conn, upstream_node_info.conninfo, NULL);
}
if (upstream_check_result == true)
{
if (config_file_options.connection_check_type != CHECK_QUERY)
{
close_connection(&upstream_conn);
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
}
if (PQstatus(upstream_conn) == CONNECTION_OK)
{
PQExpBufferData event_details;
log_debug("upstream node %i has recovered",
upstream_node_info.node_id);
/* XXX check here if upstream is still primary */
/*
* -> will be a problem if another node was promoted in
* the meantime
*/
/* and upstream is now former primary */
/* XXX scan other nodes to see if any has become primary */
upstream_node_info.node_status = NODE_STATUS_UP;
monitoring_state = MS_NORMAL;
if (upstream_node_info.type == PRIMARY)
{
primary_conn = upstream_conn;
}
else
{
if (primary_conn != NULL && PQstatus(primary_conn) != CONNECTION_OK)
{
close_connection(&primary_conn);
}
if (primary_conn == NULL)
{
primary_conn = establish_primary_db_connection(upstream_conn, false);
}
}
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("reconnected to upstream node \"%s\" (ID: %i) after %i seconds, resuming monitoring"),
upstream_node_info.node_name,
upstream_node_info.node_id,
degraded_monitoring_elapsed);
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_reconnect",
true,
event_details.data);
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
goto loop;
}
}
else
{
/*
* unable to connect to former primary - check if another node
* has been promoted
*/
/* local node has been promoted */
if (get_recovery_type(local_conn) == RECTYPE_PRIMARY)
{
log_notice(_("local node is primary, checking local node state"));
/*
* It's possible the promote command timed out, but the promotion itself
* succeeded. In this case failover state will be FAILOVER_STATE_PROMOTION_FAILED;
* we can update the node record ourselves and resume primary monitoring.
*/
if (failover_state == FAILOVER_STATE_PROMOTION_FAILED)
{
int former_upstream_node_id = local_node_info.upstream_node_id;
NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
PQExpBufferData event_details;
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
update_node_record_set_primary(local_conn, local_node_info.node_id);
record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
log_notice(_("resuming monitoring as primary node after %i seconds"),
degraded_monitoring_elapsed);
initPQExpBuffer(&event_details);
appendPQExpBufferStr(&event_details,
_("promotion command failed but promotion completed successfully"));
event_info.node_id = former_upstream_node_id;
create_event_notification_extended(local_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_promote",
true,
event_details.data,
&event_info);
termPQExpBuffer(&event_details);
/* notify former siblings that they should now follow this node */
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
former_upstream_node_id,
&sibling_nodes);
notify_followers(&sibling_nodes, local_node_info.node_id);
clear_node_info_list(&sibling_nodes);
/* this will restart monitoring in primary mode */
monitoring_state = MS_NORMAL;
return;
}
/*
* There may be a delay between the node being promoted
* and the local record being updated, so if the node
* record still shows it as a standby, do nothing, we'll
* catch the update during the next loop. (e.g. node was
* manually promoted) we'll do nothing, as the repmgr
* metadata is now out-of-sync. If it does get fixed,
* we'll catch it here on a future iteration.
*/
/* refresh own internal node record */
record_status = refresh_node_record(local_conn, local_node_info.node_id, &local_node_info);
if (local_node_info.type == PRIMARY)
{
degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
log_notice(_("resuming monitoring as primary node after %i seconds"),
degraded_monitoring_elapsed);
/* this will restart monitoring in primary mode */
monitoring_state = MS_NORMAL;
return;
}
}
if (config_file_options.failover == FAILOVER_AUTOMATIC && repmgrd_is_paused(local_conn) == false)
{
NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
local_node_info.upstream_node_id,
&sibling_nodes);
if (sibling_nodes.node_count > 0)
{
NodeInfoListCell *cell;
t_node_info *follow_node_info = NULL;
log_debug("scanning %i node records to detect new primary...", sibling_nodes.node_count);
for (cell = sibling_nodes.head; cell; cell = cell->next)
{
/* skip local node check, we did that above */
if (cell->node_info->node_id == local_node_info.node_id)
{
continue;
}
/* skip witness node - we can't possibly "follow" that */
if (cell->node_info->type == WITNESS)
{
continue;
}
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{
close_connection(&cell->node_info->conn);
log_debug("unable to connect to %i ... ", cell->node_info->node_id);
close_connection(&cell->node_info->conn);
continue;
}
if (get_recovery_type(cell->node_info->conn) == RECTYPE_PRIMARY)
{
follow_node_info = cell->node_info;
close_connection(&cell->node_info->conn);
break;
}
close_connection(&cell->node_info->conn);
}
if (follow_node_info != NULL)
{
log_info(_("node \"%s\" (node ID: %i) detected as primary"),
follow_node_info->node_name,
follow_node_info->node_id);
follow_new_primary(follow_node_info->node_id);
}
}
clear_node_info_list(&sibling_nodes);
}
}
}
loop:
/* emit "still alive" log message at regular intervals, if requested */
if (config_file_options.log_status_interval > 0)
{
int log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
if (log_status_interval_elapsed >= config_file_options.log_status_interval)
{
PQExpBufferData monitoring_summary;
initPQExpBuffer(&monitoring_summary);
appendPQExpBuffer(&monitoring_summary,
_("node \"%s\" (ID: %i) monitoring upstream node \"%s\" (ID: %i) in %s state"),
local_node_info.node_name,
local_node_info.node_id,
upstream_node_info.node_name,
upstream_node_info.node_id,
print_monitoring_state(monitoring_state));
if (config_file_options.failover == FAILOVER_MANUAL)
{
appendPQExpBufferStr(&monitoring_summary,
_(" (automatic failover disabled)"));
}
log_info("%s", monitoring_summary.data);
termPQExpBuffer(&monitoring_summary);
if (monitoring_state == MS_DEGRADED && config_file_options.failover == FAILOVER_AUTOMATIC)
{
if (PQstatus(local_conn) == CONNECTION_OK && repmgrd_is_paused(local_conn))
{
log_detail(_("repmgrd paused by administrator"));
log_hint(_("execute \"repmgr service unpause\" to resume normal failover mode"));
}
else
{
log_detail(_("waiting for upstream or another primary to reappear"));
}
}
/*
* Add update about monitoring updates.
*
* Note: with cascaded replication, it's possible we're still able to write
* monitoring history to the primary even if the upstream is still reachable.
*/
if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
{
if (INSTR_TIME_IS_ZERO(last_monitoring_update))
{
log_detail(_("no monitoring statistics have been written yet"));
}
else
{
log_detail(_("last monitoring statistics update was %i seconds ago"),
calculate_elapsed(last_monitoring_update));
}
}
INSTR_TIME_SET_CURRENT(log_status_interval_start);
}
}
if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
{
bool success = update_monitoring_history();
if (success == false && PQstatus(primary_conn) != CONNECTION_OK && upstream_node_info.type == STANDBY)
{
primary_conn = establish_primary_db_connection(local_conn, false);
if (PQstatus(primary_conn) == CONNECTION_OK)
{
(void)update_monitoring_history();
}
}
}
else
{
if (config_file_options.monitoring_history == true)
{
log_verbose(LOG_WARNING, _("monitoring_history requested but primary connection not available"));
}
/*
* if monitoring not in use, we'll need to ensure the local connection
* handle isn't stale
*/
(void) connection_ping(local_conn);
}
/*
* handle local node failure
*
* currently we'll just check the connection, and try to reconnect
*
* TODO: add timeout, after which we run in degraded state
*/
check_connection(&local_node_info, &local_conn);
if (PQstatus(local_conn) != CONNECTION_OK)
{
if (local_node_info.active == true)
{
bool success = true;
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
local_node_info.active = false;
appendPQExpBuffer(&event_details,
_("unable to connect to local node \"%s\" (ID: %i), marking inactive"),
local_node_info.node_name,
local_node_info.node_id);
log_notice("%s", event_details.data);
if (PQstatus(primary_conn) == CONNECTION_OK)
{
if (update_node_record_set_active(primary_conn, local_node_info.node_id, false) == false)
{
success = false;
log_warning(_("unable to mark node \"%s\" (ID: %i) as inactive"),
local_node_info.node_name,
local_node_info.node_id);
}
}
create_event_notification(primary_conn,
&config_file_options,
local_node_info.node_id,
"standby_failure",
success,
event_details.data);
termPQExpBuffer(&event_details);
}
if (local_monitoring_state == MS_NORMAL)
{
log_info("entering degraded monitoring for the local node");
local_monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(local_degraded_monitoring_start);
}
}
else
{
int stored_local_node_id = UNKNOWN_NODE_ID;
if (local_monitoring_state == MS_DEGRADED)
{
log_info(_("connection to local node recovered after %i seconds"),
calculate_elapsed(local_degraded_monitoring_start));
local_monitoring_state = MS_NORMAL;
/*
* Check if anything has changed since the local node came back on line;
* we may need to restart monitoring.
*/
refresh_node_record(local_conn, local_node_info.node_id, &local_node_info);
if (last_known_upstream_node_id != local_node_info.upstream_node_id)
{
log_notice(_("upstream for local node \"%s\" (ID: %i) appears to have changed, restarting monitoring"),
local_node_info.node_name,
local_node_info.node_id);
log_detail(_("currently monitoring upstream %i; new upstream is %i"),
last_known_upstream_node_id,
local_node_info.upstream_node_id);
close_connection(&upstream_conn);
return;
}
/*
*
*/
if (local_node_info.type != STANDBY)
{
log_notice(_("local node \"%s\" (ID: %i) is no longer a standby, restarting monitoring"),
local_node_info.node_name,
local_node_info.node_id);
close_connection(&upstream_conn);
return;
}
}
/*
* If the local node was restarted, we'll need to reinitialise values
* stored in shared memory.
*/
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
if (stored_local_node_id == UNKNOWN_NODE_ID)
{
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
repmgrd_set_pid(local_conn, getpid(), pid_file);
}
if (PQstatus(primary_conn) == CONNECTION_OK)
{
if (get_recovery_type(primary_conn) == RECTYPE_STANDBY)
{
log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name, upstream_node_info.node_id);
close_connection(&primary_conn);
local_node_info.upstream_node_id = UNKNOWN_NODE_ID;
return;
}
}
/* we've reconnected to the local node after an outage */
if (local_node_info.active == false)
{
if (PQstatus(primary_conn) == CONNECTION_OK)
{
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
local_node_info.active = true;
appendPQExpBuffer(&event_details,
_("reconnected to local node \"%s\" (ID: %i), marking active"),
local_node_info.node_name,
local_node_info.node_id);
log_notice("%s", event_details.data);
create_event_notification(primary_conn,
&config_file_options,
local_node_info.node_id,
"standby_recovery",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
}
}
}
if (got_SIGHUP)
{
handle_sighup(&local_conn, STANDBY);
}
refresh_node_record(local_conn, local_node_info.node_id, &local_node_info);
if (local_monitoring_state == MS_NORMAL && last_known_upstream_node_id != local_node_info.upstream_node_id)
{
/*
* It's possible that after a change of upstream, the local node record will not
* yet have been updated with the new upstream node ID. Therefore we check the
* node record on the upstream, and if that matches "last_known_upstream_node_id",
* take that as the correct value.
*/
if (monitoring_state == MS_NORMAL)
{
t_node_info node_info_on_upstream = T_NODE_INFO_INITIALIZER;
record_status = get_node_record(primary_conn, config_file_options.node_id, &node_info_on_upstream);
if (last_known_upstream_node_id == node_info_on_upstream.upstream_node_id)
{
local_node_info.upstream_node_id = last_known_upstream_node_id;
}
}
if (last_known_upstream_node_id != local_node_info.upstream_node_id)
{
log_notice(_("local node \"%s\" (ID: %i)'s upstream appears to have changed, restarting monitoring"),
local_node_info.node_name,
local_node_info.node_id);
log_detail(_("currently monitoring upstream %i; new upstream is %i"),
last_known_upstream_node_id,
local_node_info.upstream_node_id);
close_connection(&upstream_conn);
return;
}
}
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
config_file_options.monitor_interval_secs);
sleep(config_file_options.monitor_interval_secs);
}
}
void
monitor_streaming_witness(void)
{
instr_time log_status_interval_start;
instr_time witness_sync_interval_start;
RecordStatus record_status;
int primary_node_id = UNKNOWN_NODE_ID;
reset_node_voting_status();
log_debug("monitor_streaming_witness()");
/*
* At this point we can't trust the local copy of "repmgr.nodes", as
* it may not have been updated. We'll scan the cluster to find the
* current primary and refresh the copy from that before proceeding
* further.
*/
primary_conn = get_primary_connection_quiet(local_conn, &primary_node_id, NULL);
/*
* Primary node should be running at repmgrd startup.
*
* Otherwise we'll skip to degraded monitoring.
*/
if (PQstatus(primary_conn) == CONNECTION_OK)
{
PQExpBufferData event_details;
char *event_type = startup_event_logged == false
? "repmgrd_start"
: "repmgrd_upstream_reconnect";
/* synchronise local copy of "repmgr.nodes", in case it was stale */
witness_copy_node_records(primary_conn, local_conn);
/*
* refresh upstream node record from primary, so it's as up-to-date
* as possible
*/
record_status = get_node_record(primary_conn, primary_node_id, &upstream_node_info);
/*
* This is unlikely to happen; if it does emit a warning for diagnostic
* purposes and plough on regardless.
*
* A check for the existence of the record will have already been carried out
* in main().
*/
if (record_status != RECORD_FOUND)
{
log_warning(_("unable to retrieve node record from primary"));
}
/* refresh local node record from the primary */
record_status = get_node_record(primary_conn, config_file_options.node_id, &local_node_info);
if (record_status != RECORD_FOUND)
{
log_error(_("no metadata record found for this node on the current primary - terminating"));
log_hint(_("check that 'repmgr witness register' was executed for this node"));
close_connection(&primary_conn);
close_connection(&local_conn);
terminate(ERR_BAD_CONFIG);
}
/*
* It's possible that the primary changed while the witness repmgrd was not
* running. This does not affect the functionality of the witness repmgrd, but
* does mean outdated node metadata will be displayed, so update that.
*/
if (local_node_info.upstream_node_id != primary_node_id)
{
update_node_record_set_upstream(primary_conn, local_node_info.node_id, primary_node_id);
local_node_info.upstream_node_id = primary_node_id;
}
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("witness monitoring connection to primary node \"%s\" (ID: %i)"),
upstream_node_info.node_name,
upstream_node_info.node_id);
log_info("%s", event_details.data);
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
event_type,
true,
event_details.data);
if (startup_event_logged == false)
startup_event_logged = true;
termPQExpBuffer(&event_details);
monitoring_state = MS_NORMAL;
INSTR_TIME_SET_CURRENT(log_status_interval_start);
INSTR_TIME_SET_CURRENT(witness_sync_interval_start);
upstream_node_info.node_status = NODE_STATUS_UP;
}
else
{
log_warning(_("unable to connect to primary"));
log_detail("\n%s", PQerrorMessage(primary_conn));
/*
* Here we're unable to connect to a primary despite having scanned all
* known nodes, so we'll grab the record of the node we think is primary
* and continue straight to degraded monitoring in the hope a primary
* will appear.
*/
primary_node_id = get_primary_node_id(local_conn);
log_notice(_("setting primary_node_id to last known ID %i"), primary_node_id);
record_status = get_node_record(local_conn, primary_node_id, &upstream_node_info);
/*
* This is unlikely to happen, but if for whatever reason there's
* no primary record in the local table, we should just give up
*/
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve node record for last known primary %i"),
primary_node_id);
log_hint(_("execute \"repmgr witness register --force\" to sync the local node records"));
PQfinish(local_conn);
terminate(ERR_BAD_CONFIG);
}
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
upstream_node_info.node_status = NODE_STATUS_DOWN;
}
while (true)
{
if (check_upstream_connection(&primary_conn, upstream_node_info.conninfo, NULL) == true)
{
set_upstream_last_seen(local_conn, upstream_node_info.node_id);
}
else
{
if (upstream_node_info.node_status == NODE_STATUS_UP)
{
instr_time upstream_node_unreachable_start;
INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);
upstream_node_info.node_status = NODE_STATUS_UNKNOWN;
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("unable to connect to primary node \"%s\" (ID: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
create_event_notification(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_disconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
try_reconnect(&primary_conn, &upstream_node_info);
/* Node has recovered - log and continue */
if (upstream_node_info.node_status == NODE_STATUS_UP)
{
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("reconnected to upstream node after %i seconds"),
upstream_node_unreachable_elapsed);
log_notice("%s", event_details.data);
/* check upstream is still primary */
if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
{
log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name, upstream_node_info.node_id);
close_connection(&primary_conn);
termPQExpBuffer(&event_details);
return;
}
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_reconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
goto loop;
}
/* still down after reconnect attempt(s) */
if (upstream_node_info.node_status == NODE_STATUS_DOWN)
{
bool failover_done = false;
failover_done = do_witness_failover();
/*
* XXX it's possible it will make sense to return in all
* cases to restart monitoring
*/
if (failover_done == true)
{
primary_node_id = get_primary_node_id(local_conn);
return;
}
}
}
}
if (monitoring_state == MS_DEGRADED)
{
int degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
log_debug("monitoring node %i in degraded state for %i seconds",
upstream_node_info.node_id,
degraded_monitoring_elapsed);
if (check_upstream_connection(&primary_conn, upstream_node_info.conninfo, NULL) == true)
{
if (config_file_options.connection_check_type != CHECK_QUERY)
{
close_connection(&primary_conn);
primary_conn = establish_db_connection(upstream_node_info.conninfo, false);
}
if (PQstatus(primary_conn) == CONNECTION_OK)
{
PQExpBufferData event_details;
upstream_node_info.node_status = NODE_STATUS_UP;
monitoring_state = MS_NORMAL;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("reconnected to upstream node \"%s\" (ID: %i) after %i seconds, resuming monitoring"),
upstream_node_info.node_name,
upstream_node_info.node_id,
degraded_monitoring_elapsed);
log_notice("%s", event_details.data);
/* check upstream is still primary */
if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
{
log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name,
upstream_node_info.node_id);
close_connection(&primary_conn);
termPQExpBuffer(&event_details);
return;
}
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_reconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
goto loop;
}
}
else
{
/*
* unable to connect to former primary - check if another node
* has been promoted
*/
NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
local_node_info.upstream_node_id,
&sibling_nodes);
if (sibling_nodes.node_count > 0)
{
NodeInfoListCell *cell;
t_node_info *follow_node_info = NULL;
log_debug("scanning %i node records to detect new primary...", sibling_nodes.node_count);
for (cell = sibling_nodes.head; cell; cell = cell->next)
{
/* skip local node check, we did that above */
if (cell->node_info->node_id == local_node_info.node_id)
{
continue;
}
/* skip node if configured as a witness node - we can't possibly "follow" that */
if (cell->node_info->type == WITNESS)
{
continue;
}
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{
close_connection(&cell->node_info->conn);
log_debug("unable to connect to %i ... ", cell->node_info->node_id);
close_connection(&cell->node_info->conn);
continue;
}
if (get_recovery_type(cell->node_info->conn) == RECTYPE_PRIMARY)
{
follow_node_info = cell->node_info;
close_connection(&cell->node_info->conn);
break;
}
close_connection(&cell->node_info->conn);
}
if (follow_node_info != NULL)
{
log_info(_("node \"%s\" (node ID: %i) detected as primary"),
follow_node_info->node_name,
follow_node_info->node_id);
witness_follow_new_primary(follow_node_info->node_id);
}
}
clear_node_info_list(&sibling_nodes);
}
}
loop:
/*
* handle local node failure
*
* currently we'll just check the connection, and try to reconnect
*
* TODO: add timeout, after which we run in degraded state
*/
(void) connection_ping(local_conn);
check_connection(&local_node_info, &local_conn);
if (PQstatus(local_conn) != CONNECTION_OK)
{
if (local_node_info.active == true)
{
bool success = true;
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
local_node_info.active = false;
appendPQExpBuffer(&event_details,
_("unable to connect to local node \"%s\" (ID: %i), marking inactive"),
local_node_info.node_name,
local_node_info.node_id);
log_notice("%s", event_details.data);
if (PQstatus(primary_conn) == CONNECTION_OK)
{
if (update_node_record_set_active(primary_conn, local_node_info.node_id, false) == false)
{
success = false;
log_warning(_("unable to mark node \"%s\" (ID: %i) as inactive"),
local_node_info.node_name,
local_node_info.node_id);
}
}
create_event_notification(primary_conn,
&config_file_options,
local_node_info.node_id,
"standby_failure",
success,
event_details.data);
termPQExpBuffer(&event_details);
}
}
else
{
/* we've reconnected to the local node after an outage */
if (local_node_info.active == false)
{
int stored_local_node_id = UNKNOWN_NODE_ID;
if (PQstatus(primary_conn) == CONNECTION_OK)
{
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
local_node_info.active = true;
appendPQExpBuffer(&event_details,
_("reconnected to local node \"%s\" (ID: %i), marking active"),
local_node_info.node_name,
local_node_info.node_id);
log_notice("%s", event_details.data);
create_event_notification(primary_conn,
&config_file_options,
local_node_info.node_id,
"standby_recovery",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
}
/*
* If the local node was restarted, we'll need to reinitialise values
* stored in shared memory.
*/
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
if (stored_local_node_id == UNKNOWN_NODE_ID)
{
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
repmgrd_set_pid(local_conn, getpid(), pid_file);
}
}
}
/*
* Refresh repmgr.nodes after "witness_sync_interval" seconds, and check if primary
* has changed
*/
if (PQstatus(primary_conn) == CONNECTION_OK)
{
int witness_sync_interval_elapsed = calculate_elapsed(witness_sync_interval_start);
if (witness_sync_interval_elapsed >= config_file_options.witness_sync_interval)
{
if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
{
log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name, upstream_node_info.node_id);
close_connection(&primary_conn);
return;
}
log_debug("synchronising witness node records");
witness_copy_node_records(primary_conn, local_conn);
INSTR_TIME_SET_CURRENT(witness_sync_interval_start);
}
else
{
log_debug("seconds since last node record sync: %i (sync interval: %i)",
witness_sync_interval_elapsed,
config_file_options.witness_sync_interval)
}
}
/* emit "still alive" log message at regular intervals, if requested */
if (config_file_options.log_status_interval > 0)
{
int log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
if (log_status_interval_elapsed >= config_file_options.log_status_interval)
{
PQExpBufferData monitoring_summary;
initPQExpBuffer(&monitoring_summary);
appendPQExpBuffer(&monitoring_summary,
_("witness node \"%s\" (ID: %i) monitoring primary node \"%s\" (ID: %i) in %s state"),
local_node_info.node_name,
local_node_info.node_id,
upstream_node_info.node_name,
upstream_node_info.node_id,
print_monitoring_state(monitoring_state));
log_info("%s", monitoring_summary.data);
termPQExpBuffer(&monitoring_summary);
if (monitoring_state == MS_DEGRADED && config_file_options.failover == FAILOVER_AUTOMATIC)
{
log_detail(_("waiting for current or new primary to reappear"));
}
INSTR_TIME_SET_CURRENT(log_status_interval_start);
}
}
if (got_SIGHUP)
{
handle_sighup(&local_conn, WITNESS);
}
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
config_file_options.monitor_interval_secs);
sleep(config_file_options.monitor_interval_secs);
}
return;
}
static bool
do_primary_failover(void)
{
ElectionResult election_result;
bool final_result = false;
NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
int new_primary_id = UNKNOWN_NODE_ID;
bool standby_disconnect_on_failover = false;
/*
* Double-check status of the local connection
*/
check_connection(&local_node_info, &local_conn);
/*
* if requested, disable WAL receiver and wait until WAL receivers on all
* sibling nodes are disconnected
*/
if (config_file_options.standby_disconnect_on_failover == true)
{
/*
* Determine whether we can actually disable the walsender; this depends
* on PostgreSQL version and user permissions.
*/
standby_disconnect_on_failover = can_disable_walsender(local_conn);
if (standby_disconnect_on_failover == true)
{
NodeInfoListCell *cell = NULL;
NodeInfoList check_sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
int i;
bool sibling_node_wal_receiver_connected = false;
disable_wal_receiver(local_conn);
/*
* Loop through all reachable sibling nodes to determine whether
* they have disabled their WAL receivers.
*
* TODO: do_election() also calls get_active_sibling_node_records(),
* consolidate calls if feasible
*
*/
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
local_node_info.upstream_node_id,
&check_sibling_nodes);
for (i = 0; i < config_file_options.sibling_nodes_disconnect_timeout; i++)
{
for (cell = check_sibling_nodes.head; cell; cell = cell->next)
{
if (cell->node_info->conn == NULL)
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{
log_warning(_("unable to query WAL receiver PID on node \"%s\" (ID: %i)"),
cell->node_info->node_name,
cell->node_info->node_id);
close_connection(&cell->node_info->conn);
}
else
{
pid_t sibling_wal_receiver_pid = (pid_t)get_wal_receiver_pid(cell->node_info->conn);
if (sibling_wal_receiver_pid == UNKNOWN_PID)
{
log_warning(_("unable to query WAL receiver PID on node %i"),
cell->node_info->node_id);
}
else if (sibling_wal_receiver_pid > 0)
{
log_info(_("WAL receiver PID on node %i is %i"),
cell->node_info->node_id,
sibling_wal_receiver_pid);
sibling_node_wal_receiver_connected = true;
}
}
}
if (sibling_node_wal_receiver_connected == false)
{
log_notice(_("WAL receiver disconnected on all sibling nodes"));
break;
}
log_debug("sleeping %i of max %i seconds (\"sibling_nodes_disconnect_timeout\")",
i + 1, config_file_options.sibling_nodes_disconnect_timeout);
sleep(1);
}
if (sibling_node_wal_receiver_connected == true)
{
/* TODO: prevent any such nodes becoming promotion candidates */
log_warning(_("WAL receiver still connected on at least one sibling node"));
}
else
{
log_info(_("WAL receiver disconnected on all %i sibling nodes"),
check_sibling_nodes.node_count);
}
clear_node_info_list(&check_sibling_nodes);
}
}
/* attempt to initiate voting process */
election_result = do_election(&sibling_nodes, &new_primary_id);
/* TODO add pre-event notification here */
failover_state = FAILOVER_STATE_UNKNOWN;
log_debug("election result: %s", _print_election_result(election_result));
/* Reenable WAL receiver, if disabled */
if (standby_disconnect_on_failover == true)
{
/* adjust "wal_retrieve_retry_interval" but don't wait for WAL receiver to start */
enable_wal_receiver(local_conn, false);
}
/* election was cancelled and do_election() did not determine a new primary */
if (election_result == ELECTION_CANCELLED)
{
if (new_primary_id == UNKNOWN_NODE_ID)
{
log_notice(_("election cancelled"));
clear_node_info_list(&sibling_nodes);
return false;
}
log_info(_("follower node intending to follow new primary %i"), new_primary_id);
failover_state = FAILOVER_STATE_FOLLOW_NEW_PRIMARY;
}
else if (election_result == ELECTION_RERUN)
{
log_notice(_("promotion candidate election will be rerun"));
/* notify siblings that they should rerun the election too */
notify_followers(&sibling_nodes, ELECTION_RERUN_NOTIFICATION);
failover_state = FAILOVER_STATE_ELECTION_RERUN;
}
else if (election_result == ELECTION_WON)
{
if (sibling_nodes.node_count > 0)
{
log_notice("this node is the winner, will now promote itself and inform other nodes");
}
else
{
log_notice("this node is the only available candidate and will now promote itself");
}
failover_state = promote_self();
}
else if (election_result == ELECTION_LOST || election_result == ELECTION_NOT_CANDIDATE)
{
/*
* if the node couldn't be promoted as it's not in the same location as the primary,
* add an explanatory notice
*/
if (election_result == ELECTION_NOT_CANDIDATE && strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) != 0)
{
log_notice(_("this node's location (\"%s\") is not the primary node location (\"%s\"), so node cannot be promoted"),
local_node_info.location,
upstream_node_info.location);
}
log_info(_("follower node awaiting notification from a candidate node"));
failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
}
/*
* node has determined a new primary is already available
*/
if (failover_state == FAILOVER_STATE_FOLLOW_NEW_PRIMARY)
{
failover_state = follow_new_primary(new_primary_id);
}
/*
* node has decided it is a follower, so will await notification from the
* candidate that it has promoted itself and can be followed
*/
else if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
{
/* TODO: rerun election if new primary doesn't appear after timeout */
/* either follow, self-promote or time out; either way resume monitoring */
if (wait_primary_notification(&new_primary_id) == true)
{
/* if primary has reappeared, no action needed */
if (new_primary_id == upstream_node_info.node_id)
{
failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY;
}
/* if new_primary_id is self, promote */
else if (new_primary_id == local_node_info.node_id)
{
log_notice(_("this node is promotion candidate, promoting"));
failover_state = promote_self();
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
upstream_node_info.node_id,
&sibling_nodes);
}
/* election rerun */
else if (new_primary_id == ELECTION_RERUN_NOTIFICATION)
{
log_notice(_("received notification from promotion candidate to rerun election"));
failover_state = FAILOVER_STATE_ELECTION_RERUN;
}
else if (config_file_options.failover == FAILOVER_MANUAL)
{
/* automatic failover disabled */
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
RecordStatus record_status = RECORD_NOT_FOUND;
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
new_primary_id);
}
else
{
PQExpBufferData event_details;
PGconn *new_primary_conn;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("node \"%s\" (ID: %i) is in manual failover mode and is now disconnected from streaming replication"),
local_node_info.node_name,
local_node_info.node_id);
new_primary_conn = establish_db_connection(new_primary.conninfo, false);
create_event_notification(new_primary_conn,
&config_file_options,
local_node_info.node_id,
"standby_disconnect_manual",
/*
* here "true" indicates the action has occurred as expected
*/
true,
event_details.data);
close_connection(&new_primary_conn);
termPQExpBuffer(&event_details);
}
failover_state = FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER;
}
else
{
failover_state = follow_new_primary(new_primary_id);
}
}
else
{
failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
}
}
log_verbose(LOG_DEBUG, "failover state is %s",
format_failover_state(failover_state));
switch (failover_state)
{
case FAILOVER_STATE_PROMOTED:
/* notify former siblings that they should now follow this node */
notify_followers(&sibling_nodes, local_node_info.node_id);
/* pass control back down to start_monitoring() */
log_info(_("switching to primary monitoring mode"));
failover_state = FAILOVER_STATE_NONE;
final_result = true;
break;
case FAILOVER_STATE_ELECTION_RERUN:
/* we no longer care about our former siblings */
clear_node_info_list(&sibling_nodes);
log_notice(_("rerunning election after %i seconds (\"election_rerun_interval\")"),
config_file_options.election_rerun_interval);
sleep(config_file_options.election_rerun_interval);
log_info(_("election rerun will now commence"));
/*
* mark the upstream node as "up" so another election is triggered
* after we fall back to monitoring
*/
upstream_node_info.node_status = NODE_STATUS_UP;
failover_state = FAILOVER_STATE_NONE;
final_result = false;
break;
case FAILOVER_STATE_PRIMARY_REAPPEARED:
/*
* notify siblings that they should resume following the original
* primary
*/
notify_followers(&sibling_nodes, upstream_node_info.node_id);
/* pass control back down to start_monitoring() */
log_info(_("resuming %s monitoring mode"), get_node_type_string(local_node_info.type));
log_detail(_("original primary \"%s\" (ID: %i) reappeared"),
upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE;
final_result = true;
break;
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
log_info(_("resuming %s monitoring mode"), get_node_type_string(local_node_info.type));
log_detail(_("following new primary \"%s\" (ID: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE;
final_result = true;
break;
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
log_info(_("resuming %s monitoring mode"), get_node_type_string(local_node_info.type));
log_detail(_("following original primary \"%s\" (ID: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE;
final_result = true;
break;
case FAILOVER_STATE_PROMOTION_FAILED:
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
final_result = false;
break;
case FAILOVER_STATE_FOLLOW_FAIL:
/*
* for whatever reason we were unable to follow the new primary -
* continue monitoring in degraded state
*/
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
final_result = false;
break;
case FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER:
log_info(_("automatic failover disabled for this node, manual intervention required"));
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
final_result = false;
break;
case FAILOVER_STATE_NO_NEW_PRIMARY:
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
/* pass control back down to start_monitoring() */
final_result = false;
break;
case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
case FAILOVER_STATE_UNKNOWN:
case FAILOVER_STATE_NONE:
final_result = false;
break;
default: /* should never reach here */
log_warning(_("unhandled failover state %i"), failover_state);
break;
}
/* we no longer care about our former siblings */
clear_node_info_list(&sibling_nodes);
return final_result;
}
static bool
update_monitoring_history(void)
{
ReplInfo replication_info;
XLogRecPtr primary_last_wal_location = InvalidXLogRecPtr;
long long unsigned int apply_lag_bytes = 0;
long long unsigned int replication_lag_bytes = 0;
/* both local and primary connections must be available */
if (PQstatus(primary_conn) != CONNECTION_OK)
{
log_warning(_("primary connection is not available, unable to update monitoring history"));
return false;
}
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_warning(_("local connection is not available, unable to update monitoring history"));
return false;
}
init_replication_info(&replication_info);
if (get_replication_info(local_conn, STANDBY, &replication_info) == false)
{
log_warning(_("unable to retrieve replication status information, unable to update monitoring history"));
return false;
}
/*
* This can be the case when a standby is starting up after following
* a new primary, or when it has dropped back to archive recovery.
* As long as we can connect to the primary, we can still provide lag information.
*/
if (replication_info.receiving_streamed_wal == false)
{
log_verbose(LOG_WARNING, _("standby %i not connected to streaming replication"),
local_node_info.node_id);
}
primary_last_wal_location = get_primary_current_lsn(primary_conn);
if (primary_last_wal_location == InvalidXLogRecPtr)
{
log_warning(_("unable to retrieve primary's current LSN"));
return false;
}
/* calculate apply lag in bytes */
if (replication_info.last_wal_receive_lsn >= replication_info.last_wal_replay_lsn)
{
apply_lag_bytes = (long long unsigned int) (replication_info.last_wal_receive_lsn - replication_info.last_wal_replay_lsn);
}
else
{
/* if this happens, it probably indicates archive recovery */
apply_lag_bytes = 0;
}
/* calculate replication lag in bytes */
if (primary_last_wal_location >= replication_info.last_wal_receive_lsn)
{
replication_lag_bytes = (long long unsigned int) (primary_last_wal_location - replication_info.last_wal_receive_lsn);
log_debug("replication lag in bytes is: %llu", replication_lag_bytes);
}
else
{
/*
* This should never happen, but in case it does set replication lag
* to zero
*/
log_warning("primary xlog location (%X/%X) is behind the standby receive location (%X/%X)",
format_lsn(primary_last_wal_location),
format_lsn(replication_info.last_wal_receive_lsn));
replication_lag_bytes = 0;
}
add_monitoring_record(primary_conn,
local_conn,
primary_node_id,
local_node_info.node_id,
replication_info.current_timestamp,
primary_last_wal_location,
replication_info.last_wal_receive_lsn,
replication_info.last_xact_replay_timestamp,
replication_lag_bytes,
apply_lag_bytes);
INSTR_TIME_SET_CURRENT(last_monitoring_update);
log_verbose(LOG_DEBUG, "update_monitoring_history(): monitoring history update sent");
return true;
}
/*
* do_upstream_standby_failover()
*
* Attach cascaded standby to another node, currently the primary.
*
* Note that in contrast to a primary failover, where one of the downstream
* standby nodes will become a primary, a cascaded standby failover (where the
* upstream standby has gone away) is "just" a case of attaching the standby to
* another node.
*
* Currently we will try to attach the node to the cluster primary.
*
* TODO: As of repmgr 4.3, "repmgr standby follow" supports attaching a standby to another
* standby node. We need to provide a selection of reconnection strategies as different
* behaviour might be desirable in different situations.
*/
static bool
do_upstream_standby_failover(void)
{
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
RecordStatus record_status = RECORD_NOT_FOUND;
RecoveryType primary_type = RECTYPE_UNKNOWN;
int i, standby_follow_result;
char parsed_follow_command[MAXPGPATH] = "";
close_connection(&upstream_conn);
/*
*
*/
if (config_file_options.failover == FAILOVER_MANUAL)
{
log_notice(_("this node is not configured for automatic failover"));
log_detail(_("parameter \"failover\" is set to \"manual\""));
return false;
}
if (get_primary_node_record(local_conn, &primary_node_info) == false)
{
log_error(_("unable to retrieve primary node record"));
return false;
}
/*
* Verify that we can still talk to the cluster primary, even though the
* node's upstream is not available
*/
check_connection(&primary_node_info, &primary_conn);
if (PQstatus(primary_conn) != CONNECTION_OK)
{
log_error(_("unable to connect to last known primary \"%s\" (ID: %i)"),
primary_node_info.node_name,
primary_node_info.node_id);
close_connection(&primary_conn);
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
return false;
}
primary_type = get_recovery_type(primary_conn);
if (primary_type != RECTYPE_PRIMARY)
{
if (primary_type == RECTYPE_STANDBY)
{
log_error(_("last known primary \"%s\" (ID: %i) is in recovery, not following"),
primary_node_info.node_name,
primary_node_info.node_id);
}
else
{
log_error(_("unable to determine status of last known primary \"%s\" (ID: %i), not following"),
primary_node_info.node_name,
primary_node_info.node_id);
}
close_connection(&primary_conn);
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
return false;
}
/* Close the connection to this server */
close_connection(&local_conn);
log_debug(_("standby follow command is:\n \"%s\""),
config_file_options.follow_command);
/*
* replace %n in "config_file_options.follow_command" with ID of primary
* to follow.
*/
parse_follow_command(parsed_follow_command, config_file_options.follow_command, primary_node_info.node_id);
standby_follow_result = system(parsed_follow_command);
if (standby_follow_result != 0)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("unable to execute follow command:\n %s"),
config_file_options.follow_command);
log_error("%s", event_details.data);
/*
* It may not possible to write to the event notification table but we
* should be able to generate an external notification if required.
*/
create_event_notification(primary_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
termPQExpBuffer(&event_details);
}
/*
* It's possible that the standby is still starting up after the "follow_command"
* completes, so poll for a while until we get a connection.
*
* NOTE: we've previously closed the local connection, so even if the follow command
* failed for whatever reason and the local node remained up, we can re-open
* the local connection.
*/
for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
{
local_conn = establish_db_connection(local_node_info.conninfo, false);
if (PQstatus(local_conn) == CONNECTION_OK)
break;
close_connection(&local_conn);
log_debug("sleeping 1 second; %i of %i (\"repmgrd_standby_startup_timeout\") attempts to reconnect to local node",
i + 1,
config_file_options.repmgrd_standby_startup_timeout);
sleep(1);
}
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_error(_("unable to reconnect to local node \"%s\" (ID: %i)"),
local_node_info.node_name,
local_node_info.node_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/* refresh shared memory settings which will have been zapped by the restart */
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
repmgrd_set_pid(local_conn, getpid(), pid_file);
/*
*
*/
if (standby_follow_result != 0)
{
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/*
* update upstream_node_id to primary node (but only if follow command
* was successful)
*/
{
if (update_node_record_set_upstream(primary_conn,
local_node_info.node_id,
primary_node_info.node_id) == false)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("unable to set node \"%s\" (ID: %i)'s new upstream ID to %i"),
local_node_info.node_name,
local_node_info.node_id,
primary_node_info.node_id);
log_error("%s", event_details.data);
create_event_notification(NULL,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
termPQExpBuffer(&event_details);
terminate(ERR_BAD_CONFIG);
}
}
/* refresh own internal node record */
record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);
/*
* highly improbable this will happen, but in case we're unable to
* retrieve our node record from the primary, update it ourselves, and
* hope for the best
*/
if (record_status != RECORD_FOUND)
{
local_node_info.upstream_node_id = primary_node_info.node_id;
}
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("node \"%s\" (ID: %i) is now following primary node \"%s\" (ID: %i)"),
local_node_info.node_name,
local_node_info.node_id,
primary_node_info.node_name,
primary_node_info.node_id);
log_notice("%s", event_details.data);
create_event_notification(primary_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
/* keep the primary connection open */
return true;
}
/*
* This promotes the local node using the "promote_command" configuration
* parameter, which must be either "repmgr standby promote" or a script which
* at some point executes "repmgr standby promote".
*
* TODO: make "promote_command" and execute the same code used by
* "repmgr standby promote".
*/
static FailoverState
promote_self(void)
{
char *promote_command;
int r;
/* Store details of the failed node here */
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
RecordStatus record_status;
/*
* optionally add a delay before promoting the standby; this is mainly
* useful for testing (e.g. for reappearance of the original primary) and
* is not documented.
*/
if (config_file_options.promote_delay > 0)
{
log_debug("sleeping %i seconds before promoting standby",
config_file_options.promote_delay);
sleep(config_file_options.promote_delay);
}
if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
{
/*
* This is a corner-case situation where the repmgr metadata on the
* promotion candidate is outdated and the local node's upstream_node_id
* is not set. This is often an indication of potentially serious issues,
* such as the local node being very far behind the primary, or not being
* attached at all.
*
* In this case it may be desirable to restore the original primary.
* This behaviour can be controlled by the "always_promote" configuration option.
*/
if (config_file_options.always_promote == false)
{
log_error(_("this node (ID: %i) does not have its upstream_node_id set, not promoting"),
local_node_info.node_id);
log_detail(_("the local node's metadata has not been updated since it became a standby"));
log_hint(_("set \"always_promote\" to \"true\" to force promotion in this situation"));
return FAILOVER_STATE_PROMOTION_FAILED;
}
else
{
log_warning(_("this node (ID: %i) does not have its upstream_node_id set, promoting anyway"),
local_node_info.node_id);
log_detail(_("\"always_promote\" is set to \"true\" "));
}
}
else
{
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
local_node_info.upstream_node_id);
return FAILOVER_STATE_PROMOTION_FAILED;
}
}
/* the presence of this command has been established already */
promote_command = config_file_options.promote_command;
log_info(_("promote_command is:\n \"%s\""),
promote_command);
if (log_type == REPMGR_STDERR && *config_file_options.log_file)
{
fflush(stderr);
}
r = system(promote_command);
log_debug("result of promote_command: %i", WEXITSTATUS(r));
/* connection should stay up, but check just in case */
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_warning(_("local database connection not available"));
log_detail("\n%s", PQerrorMessage(local_conn));
close_connection(&local_conn);
local_conn = establish_db_connection(local_node_info.conninfo, true);
/* assume node failed */
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_error(_("unable to reconnect to local node"));
log_detail("\n%s", PQerrorMessage(local_conn));
close_connection(&local_conn);
/* XXX handle this */
return FAILOVER_STATE_LOCAL_NODE_FAILURE;
}
}
if (WIFEXITED(r) && WEXITSTATUS(r))
{
int primary_node_id = UNKNOWN_NODE_ID;
log_error(_("promote command failed"));
log_detail(_("promote command exited with error code %i"), WEXITSTATUS(r));
log_info(_("checking if original primary node has reappeared"));
upstream_conn = get_primary_connection(local_conn,
&primary_node_id,
NULL);
if (PQstatus(upstream_conn) != CONNECTION_OK)
{
close_connection(&upstream_conn);
}
else if (primary_node_id == failed_primary.node_id)
{
PQExpBufferData event_details;
log_notice(_("original primary \"%s\" (ID: %i) reappeared before this standby was promoted - no action taken"),
failed_primary.node_name,
failed_primary.node_id);
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("original primary \"%s\" (ID: %i) reappeared"),
failed_primary.node_name,
failed_primary.node_id);
create_event_notification(upstream_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_abort",
true,
event_details.data);
termPQExpBuffer(&event_details);
/* XXX handle this! */
/* -> we'll need to let the other nodes know too.... */
/* no failover occurred but we'll want to restart connections */
return FAILOVER_STATE_PRIMARY_REAPPEARED;
}
create_event_notification(NULL,
&config_file_options,
local_node_info.node_id,
"repmgrd_promote_error",
true,
"");
return FAILOVER_STATE_PROMOTION_FAILED;
}
/*
* Promotion has succeeded - verify local connection is still available
*/
try_reconnect(&local_conn, &local_node_info);
/* bump the electoral term */
increment_current_term(local_conn);
{
PQExpBufferData event_details;
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
/* update own internal node record */
record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
/*
* XXX here we're assuming the promote command updated metadata
*/
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("node \"%s\" (ID: %i) promoted to primary; old primary \"%s\" (ID: %i) marked as failed"),
local_node_info.node_name,
local_node_info.node_id,
failed_primary.node_name,
failed_primary.node_id);
event_info.node_id = failed_primary.node_id;
/* local_conn is now the primary connection */
create_event_notification_extended(local_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_promote",
true,
event_details.data,
&event_info);
termPQExpBuffer(&event_details);
}
return FAILOVER_STATE_PROMOTED;
}
/*
* Notify follower nodes about which node to follow. Normally this
* will be the current node, however if the original primary reappeared
* before this node could be promoted, we'll inform the followers they
* should resume monitoring the original primary.
*/
static void
notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
{
NodeInfoListCell *cell;
log_info(_("%i followers to notify"),
standby_nodes->node_count);
for (cell = standby_nodes->head; cell; cell = cell->next)
{
log_verbose(LOG_DEBUG, "intending to notify node %i...", cell->node_info->node_id);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{
log_info(_("reconnecting to node \"%s\" (ID: %i)..."),
cell->node_info->node_name,
cell->node_info->node_id);
close_connection(&cell->node_info->conn);
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
}
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{
log_warning(_("unable to reconnect to \"%s\" (ID: %i)"),
cell->node_info->node_name,
cell->node_info->node_id);
log_detail("\n%s", PQerrorMessage(cell->node_info->conn));
close_connection(&cell->node_info->conn);
continue;
}
if (follow_node_id == ELECTION_RERUN_NOTIFICATION)
{
log_notice(_("notifying node \"%s\" (ID: %i) to rerun promotion candidate selection"),
cell->node_info->node_name,
cell->node_info->node_id);
}
else
{
log_notice(_("notifying node \"%s\" (ID: %i) to follow node %i"),
cell->node_info->node_name,
cell->node_info->node_id,
follow_node_id);
}
notify_follow_primary(cell->node_info->conn, follow_node_id);
}
}
static bool
wait_primary_notification(int *new_primary_id)
{
int i;
for (i = 0; i < config_file_options.primary_notification_timeout; i++)
{
if (get_new_primary(local_conn, new_primary_id) == true)
{
log_debug("new primary is %i; elapsed: %i seconds",
*new_primary_id, i);
return true;
}
log_verbose(LOG_DEBUG, "waiting for new primary notification, %i of max %i seconds (\"primary_notification_timeout\")",
i, config_file_options.primary_notification_timeout);
sleep(1);
}
log_warning(_("no notification received from new primary after %i seconds"),
config_file_options.primary_notification_timeout);
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
return false;
}
static FailoverState
follow_new_primary(int new_primary_id)
{
char parsed_follow_command[MAXPGPATH] = "";
int i, r;
/* Store details of the failed node here */
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
RecordStatus record_status = RECORD_NOT_FOUND;
bool new_primary_ok = false;
log_verbose(LOG_DEBUG, "follow_new_primary(): new primary id is %i", new_primary_id);
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
new_primary_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
log_notice(_("attempting to follow new primary \"%s\" (node ID: %i)"),
new_primary.node_name,
new_primary_id);
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record for failed primary (ID: %i)"),
local_node_info.upstream_node_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/* XXX check if new_primary_id == failed_primary.node_id? */
if (log_type == REPMGR_STDERR && *config_file_options.log_file)
{
fflush(stderr);
}
upstream_conn = establish_db_connection(new_primary.conninfo, false);
if (PQstatus(upstream_conn) == CONNECTION_OK)
{
RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
if (primary_recovery_type == RECTYPE_PRIMARY)
{
new_primary_ok = true;
}
else
{
new_primary_ok = false;
log_warning(_("new primary \"%s\" (node ID: %i) is in recovery"),
new_primary.node_name,
new_primary_id);
close_connection(&upstream_conn);
}
}
if (new_primary_ok == false)
{
return FAILOVER_STATE_FOLLOW_FAIL;
}
/*
* disconnect from local node, as follow operation will result in a server
* restart
*/
close_connection(&local_conn);
/*
* replace %n in "config_file_options.follow_command" with ID of primary
* to follow.
*/
parse_follow_command(parsed_follow_command, config_file_options.follow_command, new_primary_id);
log_debug(_("standby follow command is:\n \"%s\""),
parsed_follow_command);
/* execute the follow command */
r = system(parsed_follow_command);
if (r != 0)
{
PGconn *old_primary_conn;
/*
* The "standby follow" command could still fail due to the original primary
* reappearing before the candidate could promote itself ("repmgr
* standby follow" will refuse to promote another node if the primary
* is available). However the new primary will only instruct the other
* nodes to follow it after it's successfully promoted itself, so this
* case is highly unlikely. A slightly more likely scenario would
* be the new primary becoming unavailable just after it's sent notifications
* to its follower nodes, and the old primary becoming available again.
*/
old_primary_conn = establish_db_connection(failed_primary.conninfo, false);
if (PQstatus(old_primary_conn) == CONNECTION_OK)
{
RecoveryType upstream_recovery_type = get_recovery_type(old_primary_conn);
if (upstream_recovery_type == RECTYPE_PRIMARY)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBufferStr(&event_details,
_("original primary reappeared - no action taken"));
log_notice("%s", event_details.data);
create_event_notification(old_primary_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_aborted",
true,
event_details.data);
termPQExpBuffer(&event_details);
close_connection(&upstream_conn);
close_connection(&old_primary_conn);
return FAILOVER_STATE_PRIMARY_REAPPEARED;
}
log_notice(_("original primary reappeared as standby"));
close_connection(&old_primary_conn);
}
close_connection(&upstream_conn);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/*
* refresh local copy of local and primary node records - we get these
* directly from the primary to ensure they're the current version
*/
record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record found for node %i"),
new_primary_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record found for node %i"),
local_node_info.node_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/*
* It's possible that the standby is still starting up after the "follow_command"
* completes, so poll for a while until we get a connection.
*/
for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
{
local_conn = establish_db_connection(local_node_info.conninfo, false);
if (PQstatus(local_conn) == CONNECTION_OK)
break;
close_connection(&local_conn);
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
i + 1,
config_file_options.repmgrd_standby_startup_timeout);
sleep(1);
}
if (local_conn == NULL || PQstatus(local_conn) != CONNECTION_OK)
{
log_error(_("unable to reconnect to local node \"%s\" (ID: %i)"),
local_node_info.node_name,
local_node_info.node_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/* refresh shared memory settings which will have been zapped by the restart */
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
repmgrd_set_pid(local_conn, getpid(), pid_file);
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("node \"%s\" (ID: %i) now following new upstream node \"%s\" (ID: %i)"),
local_node_info.node_name,
local_node_info.node_id,
upstream_node_info.node_name,
upstream_node_info.node_id);
log_notice("%s", event_details.data);
create_event_notification(upstream_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
}
static FailoverState
witness_follow_new_primary(int new_primary_id)
{
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
RecordStatus record_status = RECORD_NOT_FOUND;
bool new_primary_ok = false;
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
new_primary_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/* TODO: check if new_primary_id == failed_primary.node_id? */
if (log_type == REPMGR_STDERR && *config_file_options.log_file)
{
fflush(stderr);
}
upstream_conn = establish_db_connection(new_primary.conninfo, false);
if (PQstatus(upstream_conn) == CONNECTION_OK)
{
RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
switch (primary_recovery_type)
{
case RECTYPE_PRIMARY:
new_primary_ok = true;
break;
case RECTYPE_STANDBY:
new_primary_ok = false;
log_warning(_("new primary \"%s\" (node ID: %i) is in recovery"),
new_primary.node_name,
new_primary_id);
break;
case RECTYPE_UNKNOWN:
new_primary_ok = false;
log_warning(_("unable to determine status of new primary"));
break;
}
}
if (new_primary_ok == false)
{
close_connection(&upstream_conn);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/* set new upstream node ID on primary */
update_node_record_set_upstream(upstream_conn, local_node_info.node_id, new_primary_id);
witness_copy_node_records(upstream_conn, local_conn);
/*
* refresh local copy of local and primary node records - we get these
* directly from the primary to ensure they're the current version
*/
record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record found for node %i"),
new_primary_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record for node %i"),
local_node_info.node_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("witness node \"%s\" (ID: %i) now following new primary node \"%s\" (ID: %i)"),
local_node_info.node_name,
local_node_info.node_id,
upstream_node_info.node_name,
upstream_node_info.node_id);
log_notice("%s", event_details.data);
create_event_notification(upstream_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
termPQExpBuffer(&event_details);
}
return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
}
static const char *
_print_election_result(ElectionResult result)
{
switch (result)
{
case ELECTION_NOT_CANDIDATE:
return "NOT CANDIDATE";
case ELECTION_WON:
return "WON";
case ELECTION_LOST:
return "LOST";
case ELECTION_CANCELLED:
return "CANCELLED";
case ELECTION_RERUN:
return "RERUN";
}
/* should never reach here */
return "UNKNOWN";
}
/*
* Failover decision for nodes attached to the current primary.
*
* NB: this function sets "sibling_nodes"; caller (do_primary_failover)
* expects to be able to read this list
*/
static ElectionResult
do_election(NodeInfoList *sibling_nodes, int *new_primary_id)
{
int electoral_term = -1;
NodeInfoListCell *cell = NULL;
t_node_info *candidate_node = NULL;
election_stats stats;
ReplInfo local_replication_info;
/* To collate details of nodes with primary visible for logging purposes */
PQExpBufferData nodes_with_primary_visible;
/*
* Check if at least one server in the primary's location is visible; if
* not we'll assume a network split between this node and the primary
* location, and not promote any standby.
*
* NOTE: this function is only ever called by standbys attached to the
* current (unreachable) primary, so "upstream_node_info" will always
* contain the primary node record.
*/
bool primary_location_seen = false;
int nodes_with_primary_still_visible = 0;
if (config_file_options.failover_delay > 0)
{
log_debug("sleeping %i seconds (\"failover_delay\") before initiating failover",
config_file_options.failover_delay);
sleep(config_file_options.failover_delay);
}
/* we're visible */
stats.visible_nodes = 1;
stats.shared_upstream_nodes = 0;
stats.all_nodes = 0;
electoral_term = get_current_term(local_conn);
if (electoral_term == -1)
{
log_error(_("unable to determine electoral term"));
return ELECTION_NOT_CANDIDATE;
}
log_debug("do_election(): electoral term is %i", electoral_term);
if (config_file_options.failover == FAILOVER_MANUAL)
{
log_notice(_("this node is not configured for automatic failover so will not be considered as promotion candidate, and will not follow the new primary"));
log_detail(_("\"failover\" is set to \"manual\" in repmgr.conf"));
log_hint(_("manually execute \"repmgr standby follow\" to have this node follow the new primary"));
return ELECTION_NOT_CANDIDATE;
}
/* node priority is set to zero - don't become a candidate, and lose by default */
if (local_node_info.priority <= 0)
{
log_notice(_("this node's priority is %i so will not be considered as an automatic promotion candidate"),
local_node_info.priority);
return ELECTION_LOST;
}
/* get all active nodes attached to upstream, excluding self */
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
upstream_node_info.node_id,
sibling_nodes);
log_info(_("%i active sibling nodes registered"), sibling_nodes->node_count);
stats.shared_upstream_nodes = sibling_nodes->node_count + 1;
get_all_nodes_count(local_conn, &stats.all_nodes);
log_info(_("%i total nodes registered"), stats.all_nodes);
if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) != 0)
{
log_info(_("primary node \"%s\" (ID: %i) has location \"%s\", this node's location is \"%s\""),
upstream_node_info.node_name,
upstream_node_info.node_id,
upstream_node_info.location,
local_node_info.location);
}
else
{
log_info(_("primary node \"%s\" (ID: %i) and this node have the same location (\"%s\")"),
upstream_node_info.node_name,
upstream_node_info.node_id,
local_node_info.location);
}
local_node_info.last_wal_receive_lsn = InvalidXLogRecPtr;
/* fast path if no other standbys (or witness) exists - normally win by default */
if (sibling_nodes->node_count == 0)
{
if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
{
if (config_file_options.failover_validation_command[0] != '\0')
{
return execute_failover_validation_command(&local_node_info, &stats);
}
log_info(_("no other sibling nodes - we win by default"));
return ELECTION_WON;
}
else
{
/*
* If primary and standby have different locations set, the assumption
* is that no action should be taken as we can't tell whether there's
* been a network interruption or not.
*
* Normally a situation with primary and standby in different physical
* locations would be handled by leaving the location as "default" and
* setting up a witness server in the primary's location.
*/
log_debug("no other nodes, but primary and standby locations differ");
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
return ELECTION_NOT_CANDIDATE;
}
}
else
{
/* standby nodes found - check if we're in the primary location before checking theirs */
if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
{
primary_location_seen = true;
}
}
/* get our lsn */
if (get_replication_info(local_conn, STANDBY, &local_replication_info) == false)
{
log_error(_("unable to retrieve replication information for local node"));
return ELECTION_LOST;
}
/* check if WAL replay on local node is paused */
if (local_replication_info.wal_replay_paused == true)
{
log_debug("WAL replay is paused");
if (local_replication_info.last_wal_receive_lsn > local_replication_info.last_wal_replay_lsn)
{
log_warning(_("WAL replay on this node is paused and WAL is pending replay"));
log_detail(_("replay paused at %X/%X; last WAL received is %X/%X"),
format_lsn(local_replication_info.last_wal_replay_lsn),
format_lsn(local_replication_info.last_wal_receive_lsn));
}
/* attempt to resume WAL replay - unlikely this will fail, but just in case */
if (resume_wal_replay(local_conn) == false)
{
log_error(_("unable to resume WAL replay"));
log_detail(_("this node cannot be reliably promoted"));
return ELECTION_LOST;
}
log_notice(_("WAL replay forcibly resumed"));
}
local_node_info.last_wal_receive_lsn = local_replication_info.last_wal_receive_lsn;
log_info(_("local node's last receive lsn: %X/%X"), format_lsn(local_node_info.last_wal_receive_lsn));
/* pointer to "winning" node, initially self */
candidate_node = &local_node_info;
initPQExpBuffer(&nodes_with_primary_visible);
for (cell = sibling_nodes->head; cell; cell = cell->next)
{
ReplInfo sibling_replication_info;
log_info(_("checking state of sibling node \"%s\" (ID: %i)"),
cell->node_info->node_name,
cell->node_info->node_id);
/* assume the worst case */
cell->node_info->node_status = NODE_STATUS_UNKNOWN;
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{
close_connection(&cell->node_info->conn);
continue;
}
cell->node_info->node_status = NODE_STATUS_UP;
stats.visible_nodes++;
/*
* see if the node is in the primary's location (but skip the check if
* we've seen a node there already)
*/
if (primary_location_seen == false)
{
if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0)
{
log_debug("node %i in primary location \"%s\"",
cell->node_info->node_id,
cell->node_info->location);
primary_location_seen = true;
}
}
/*
* check if repmgrd running - skip if not
*
* TODO: include pid query in replication info query?
*
* NOTE: from Pg12 we could execute "pg_promote()" from a running repmgrd;
* here we'll need to find a way of ensuring only one repmgrd does this
*/
if (repmgrd_get_pid(cell->node_info->conn) == UNKNOWN_PID)
{
log_warning(_("repmgrd not running on node \"%s\" (ID: %i), skipping"),
cell->node_info->node_name,
cell->node_info->node_id);
continue;
}
if (get_replication_info(cell->node_info->conn, cell->node_info->type, &sibling_replication_info) == false)
{
log_warning(_("unable to retrieve replication information for node \"%s\" (ID: %i), skipping"),
cell->node_info->node_name,
cell->node_info->node_id);
continue;
}
/*
* Check if node is not in recovery - it may have been promoted
* outside of the failover mechanism, in which case we may be able
* to follow it.
*/
if (sibling_replication_info.in_recovery == false && cell->node_info->type != WITNESS)
{
bool can_follow;
log_warning(_("node \"%s\" (ID: %i) is not in recovery"),
cell->node_info->node_name,
cell->node_info->node_id);
/*
* Node is not in recovery, but still reporting an upstream
* node ID; possible it was promoted manually (e.g. with "pg_ctl promote"),
* or (less likely) the node's repmgrd has just switched to primary
* monitoring node but has not yet unset the upstream node ID in
* shared memory. Either way, log this.
*/
if (sibling_replication_info.upstream_node_id != UNKNOWN_NODE_ID)
{
log_warning(_("node \"%s\" (ID: %i) still reports its upstream is node %i, last seen %i second(s) ago"),
cell->node_info->node_name,
cell->node_info->node_id,
sibling_replication_info.upstream_node_id,
sibling_replication_info.upstream_last_seen);
}
can_follow = check_node_can_follow(local_conn,
local_node_info.last_wal_receive_lsn,
cell->node_info->conn,
cell->node_info);
if (can_follow == true)
{
*new_primary_id = cell->node_info->node_id;
termPQExpBuffer(&nodes_with_primary_visible);
return ELECTION_CANCELLED;
}
/*
* Tricky situation here - we'll assume the node is a rogue primary
*/
log_warning(_("not possible to attach to node \"%s\" (ID: %i), ignoring"),
cell->node_info->node_name,
cell->node_info->node_id);
continue;
}
else
{
log_info(_("node \"%s\" (ID: %i) reports its upstream is node %i, last seen %i second(s) ago"),
cell->node_info->node_name,
cell->node_info->node_id,
sibling_replication_info.upstream_node_id,
sibling_replication_info.upstream_last_seen);
}
/* check if WAL replay on node is paused */
if (sibling_replication_info.wal_replay_paused == true)
{
/*
* Theoretically the repmgrd on the node should have resumed WAL play
* at this point.
*/
if (sibling_replication_info.last_wal_receive_lsn > sibling_replication_info.last_wal_replay_lsn)
{
log_warning(_("WAL replay on node \"%s\" (ID: %i) is paused and WAL is pending replay"),
cell->node_info->node_name,
cell->node_info->node_id);
}
}
/*
* Check if node has seen primary "recently" - if so, we may have "partial primary visibility".
* For now we'll assume the primary is visible if it's been seen less than
* monitor_interval_secs * 2 seconds ago. We may need to adjust this, and/or make the value
* configurable.
*/
if (sibling_replication_info.upstream_last_seen >= 0 && sibling_replication_info.upstream_last_seen < (config_file_options.monitor_interval_secs * 2))
{
if (sibling_replication_info.upstream_node_id != upstream_node_info.node_id)
{
log_warning(_("assumed sibling node \"%s\" (ID: %i) monitoring different upstream node %i"),
cell->node_info->node_name,
cell->node_info->node_id,
sibling_replication_info.upstream_node_id);
}
else
{
nodes_with_primary_still_visible++;
log_notice(_("%s node \"%s\" (ID: %i) last saw primary node %i second(s) ago, considering primary still visible"),
get_node_type_string(cell->node_info->type),
cell->node_info->node_name,
cell->node_info->node_id,
sibling_replication_info.upstream_last_seen);
appendPQExpBuffer(&nodes_with_primary_visible,
" - node \"%s\" (ID: %i): %i second(s) ago\n",
cell->node_info->node_name,
cell->node_info->node_id,
sibling_replication_info.upstream_last_seen);
}
}
else
{
log_info(_("%s node \"%s\" (ID: %i) last saw primary node %i second(s) ago"),
get_node_type_string(cell->node_info->type),
cell->node_info->node_name,
cell->node_info->node_id,
sibling_replication_info.upstream_last_seen);
}
/* don't interrogate a witness server */
if (cell->node_info->type == WITNESS)
{
log_debug("node %i is witness, not querying state", cell->node_info->node_id);
continue;
}
/* don't check 0-priority nodes */
if (cell->node_info->priority <= 0)
{
log_info(_("node \"%s\" (ID: %i) has priority of %i, skipping"),
cell->node_info->node_name,
cell->node_info->node_id,
cell->node_info->priority);
continue;
}
/* get node's last receive LSN - if "higher" than current winner, current node is candidate */
cell->node_info->last_wal_receive_lsn = sibling_replication_info.last_wal_receive_lsn;
log_info(_("last receive LSN for sibling node \"%s\" (ID: %i) is: %X/%X"),
cell->node_info->node_name,
cell->node_info->node_id,
format_lsn(cell->node_info->last_wal_receive_lsn));
/* compare LSN */
if (cell->node_info->last_wal_receive_lsn > candidate_node->last_wal_receive_lsn)
{
/* other node is ahead */
log_info(_("node \"%s\" (ID: %i) is ahead of current candidate \"%s\" (ID: %i)"),
cell->node_info->node_name,
cell->node_info->node_id,
candidate_node->node_name,
candidate_node->node_id);
candidate_node = cell->node_info;
}
/* LSN is same - tiebreak on priority, then node_id */
else if (cell->node_info->last_wal_receive_lsn == candidate_node->last_wal_receive_lsn)
{
log_info(_("node \"%s\" (ID: %i) has same LSN as current candidate \"%s\" (ID: %i)"),
cell->node_info->node_name,
cell->node_info->node_id,
candidate_node->node_name,
candidate_node->node_id);
if (cell->node_info->priority > candidate_node->priority)
{
log_info(_("node \"%s\" (ID: %i) has higher priority (%i) than current candidate \"%s\" (ID: %i) (%i)"),
cell->node_info->node_name,
cell->node_info->node_id,
cell->node_info->priority,
candidate_node->node_name,
candidate_node->node_id,
candidate_node->priority);
candidate_node = cell->node_info;
}
else if (cell->node_info->priority == candidate_node->priority)
{
if (cell->node_info->node_id < candidate_node->node_id)
{
log_info(_("node \"%s\" (ID: %i) has same priority but lower node_id than current candidate \"%s\" (ID: %i)"),
cell->node_info->node_name,
cell->node_info->node_id,
candidate_node->node_name,
candidate_node->node_id);
candidate_node = cell->node_info;
}
}
else
{
log_info(_("node \"%s\" (ID: %i) has lower priority (%i) than current candidate \"%s\" (ID: %i) (%i)"),
cell->node_info->node_name,
cell->node_info->node_id,
cell->node_info->priority,
candidate_node->node_name,
candidate_node->node_id,
candidate_node->priority);
}
}
}
if (primary_location_seen == false)
{
log_notice(_("no nodes from the primary location \"%s\" visible - assuming network split"),
upstream_node_info.location);
log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
reset_node_voting_status();
termPQExpBuffer(&nodes_with_primary_visible);
return ELECTION_CANCELLED;
}
if (nodes_with_primary_still_visible > 0)
{
log_info(_("%i nodes can see the primary"),
nodes_with_primary_still_visible);
log_detail(_("following nodes can see the primary:\n%s"),
nodes_with_primary_visible.data);
if (config_file_options.primary_visibility_consensus == true)
{
log_notice(_("cancelling failover as some nodes can still see the primary"));
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
reset_node_voting_status();
termPQExpBuffer(&nodes_with_primary_visible);
return ELECTION_CANCELLED;
}
}
termPQExpBuffer(&nodes_with_primary_visible);
log_info(_("visible nodes: %i; total nodes: %i; no nodes have seen the primary within the last %i seconds"),
stats.visible_nodes,
stats.shared_upstream_nodes,
(config_file_options.monitor_interval_secs * 2));
if (stats.visible_nodes <= (stats.shared_upstream_nodes / 2.0))
{
log_notice(_("unable to reach a qualified majority of nodes"));
log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
reset_node_voting_status();
return ELECTION_CANCELLED;
}
log_notice(_("promotion candidate is \"%s\" (ID: %i)"),
candidate_node->node_name,
candidate_node->node_id);
if (candidate_node->node_id == local_node_info.node_id)
{
/*
* If "failover_validation_command" is set, execute that command
* and decide the result based on the command's output
*/
if (config_file_options.failover_validation_command[0] != '\0')
{
return execute_failover_validation_command(candidate_node, &stats);
}
return ELECTION_WON;
}
return ELECTION_LOST;
}
/*
* "failover" for the witness node; the witness has no part in the election
* other than being reachable, so just needs to await notification from the
* new primary
*/
static
bool do_witness_failover(void)
{
int new_primary_id = UNKNOWN_NODE_ID;
/* TODO add pre-event notification here */
failover_state = FAILOVER_STATE_UNKNOWN;
if (wait_primary_notification(&new_primary_id) == true)
{
/* if primary has reappeared, no action needed */
if (new_primary_id == upstream_node_info.node_id)
{
failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY;
}
else
{
failover_state = witness_follow_new_primary(new_primary_id);
}
}
else
{
failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
}
log_verbose(LOG_DEBUG, "failover state is %s",
format_failover_state(failover_state));
switch (failover_state)
{
case FAILOVER_STATE_PRIMARY_REAPPEARED:
/* pass control back down to start_monitoring() */
log_info(_("resuming %s monitoring mode"),get_node_type_string(local_node_info.type));
log_detail(_("original primary \"%s\" (ID: %i) reappeared"),
upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE;
return true;
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
log_info(_("resuming %s monitoring mode"),get_node_type_string(local_node_info.type));
log_detail(_("following new primary \"%s\" (ID: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE;
return true;
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
log_info(_("resuming %s monitoring mode"),get_node_type_string(local_node_info.type));
log_detail(_("following original primary \"%s\" (ID: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE;
return true;
case FAILOVER_STATE_FOLLOW_FAIL:
/*
* for whatever reason we were unable to follow the new primary -
* continue monitoring in degraded state
*/
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
return false;
default:
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
return false;
}
/* should never reach here */
return false;
}
static void
reset_node_voting_status(void)
{
failover_state = FAILOVER_STATE_NONE;
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_error(_("reset_node_voting_status(): local_conn not set"));
log_detail("\n%s", PQerrorMessage(local_conn));
return;
}
reset_voting_status(local_conn);
}
static void
check_connection(t_node_info *node_info, PGconn **conn)
{
if (is_server_available(node_info->conninfo) == false)
{
log_warning(_("connection to node \"%s\" (ID: %i) lost"),
node_info->node_name,
node_info->node_id);
log_detail("\n%s", PQerrorMessage(*conn));
close_connection(conn);
}
if (PQstatus(*conn) != CONNECTION_OK)
{
log_info(_("attempting to reconnect to node \"%s\" (ID: %i)"),
node_info->node_name,
node_info->node_id);
close_connection(conn);
*conn = establish_db_connection(node_info->conninfo, false);
if (PQstatus(*conn) != CONNECTION_OK)
{
close_connection(conn);
log_warning(_("reconnection to node \"%s\" (ID: %i) failed"),
node_info->node_name,
node_info->node_id);
}
else
{
int stored_local_node_id = UNKNOWN_NODE_ID;
log_info(_("reconnected to node \"%s\" (ID: %i)"),
node_info->node_name,
node_info->node_id);
stored_local_node_id = repmgrd_get_local_node_id(*conn);
if (stored_local_node_id == UNKNOWN_NODE_ID)
{
repmgrd_set_local_node_id(*conn, config_file_options.node_id);
repmgrd_set_pid(local_conn, getpid(), pid_file);
}
}
}
}
static const char *
format_failover_state(FailoverState failover_state)
{
switch(failover_state)
{
case FAILOVER_STATE_UNKNOWN:
return "UNKNOWN";
case FAILOVER_STATE_NONE:
return "NONE";
case FAILOVER_STATE_PROMOTED:
return "PROMOTED";
case FAILOVER_STATE_PROMOTION_FAILED:
return "PROMOTION_FAILED";
case FAILOVER_STATE_PRIMARY_REAPPEARED:
return "PRIMARY_REAPPEARED";
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
return "LOCAL_NODE_FAILURE";
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
return "WAITING_NEW_PRIMARY";
case FAILOVER_STATE_FOLLOW_NEW_PRIMARY:
return "FOLLOW_NEW_PRIMARY";
case FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER:
return "REQUIRES_MANUAL_FAILOVER";
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
return "FOLLOWED_NEW_PRIMARY";
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
return "FOLLOWING_ORIGINAL_PRIMARY";
case FAILOVER_STATE_NO_NEW_PRIMARY:
return "NO_NEW_PRIMARY";
case FAILOVER_STATE_FOLLOW_FAIL:
return "FOLLOW_FAIL";
case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
return "NODE_NOTIFICATION_ERROR";
case FAILOVER_STATE_ELECTION_RERUN:
return "ELECTION_RERUN";
}
/* should never reach here */
return "UNKNOWN_FAILOVER_STATE";
}
static void
handle_sighup(PGconn **conn, t_server_type server_type)
{
log_notice(_("received SIGHUP, reloading configuration"));
if (reload_config(server_type))
{
close_connection(conn);
*conn = establish_db_connection(config_file_options.conninfo, true);
}
if (*config_file_options.log_file)
{
FILE *fd;
log_debug("reopening %s", config_file_options.log_file);
fd = freopen(config_file_options.log_file, "a", stderr);
if (fd == NULL)
{
fprintf(stderr, "error reopening stderr to \"%s\": %s",
config_file_options.log_file, strerror(errno));
}
}
got_SIGHUP = false;
}
static ElectionResult
execute_failover_validation_command(t_node_info *node_info, election_stats *stats)
{
PQExpBufferData failover_validation_command;
PQExpBufferData command_output;
int return_value = -1;
initPQExpBuffer(&failover_validation_command);
initPQExpBuffer(&command_output);
parse_failover_validation_command(config_file_options.failover_validation_command,
node_info,
stats,
&failover_validation_command);
log_notice(_("executing \"failover_validation_command\""));
log_detail("%s", failover_validation_command.data);
/* we determine success of the command by the value placed into return_value */
(void) local_command_return_value(failover_validation_command.data,
&command_output,
&return_value);
termPQExpBuffer(&failover_validation_command);
if (command_output.data[0] != '\0')
{
log_info("output returned by failover validation command:\n%s", command_output.data);
}
else
{
log_info(_("no output returned from command"));
}
termPQExpBuffer(&command_output);
if (return_value != 0)
{
/* create event here? */
log_notice(_("failover validation command returned a non-zero value: %i"),
return_value);
return ELECTION_RERUN;
}
log_notice(_("failover validation command returned zero"));
return ELECTION_WON;
}
static void
parse_failover_validation_command(const char *template, t_node_info *node_info, election_stats *stats, PQExpBufferData *out)
{
const char *src_ptr;
for (src_ptr = template; *src_ptr; src_ptr++)
{
if (*src_ptr == '%')
{
switch (src_ptr[1])
{
case '%':
/* %%: replace with % */
src_ptr++;
appendPQExpBufferChar(out, *src_ptr);
break;
case 'n':
/* %n: node id */
src_ptr++;
appendPQExpBuffer(out, "%i", node_info->node_id);
break;
case 'a':
/* %a: node name */
src_ptr++;
appendPQExpBufferStr(out, node_info->node_name);
break;
case 'v':
/* %v: visible nodes count */
src_ptr++;
appendPQExpBuffer(out, "%i", stats->visible_nodes);
break;
case 'u':
/* %u: shared upstream nodes count */
src_ptr++;
appendPQExpBuffer(out, "%i", stats->shared_upstream_nodes);
break;
case 't':
/* %t: total nodes count */
src_ptr++;
appendPQExpBuffer(out, "%i", stats->all_nodes);
break;
default:
/* otherwise treat the % as not special */
appendPQExpBufferChar(out, *src_ptr);
break;
}
}
else
{
appendPQExpBufferChar(out, *src_ptr);
}
}
return;
}
/*
* Sanity-check whether the local node can follow the proposed upstream node.
*
* Note this function is very similar to check_node_can_attach() in
* repmgr-client.c, however the later is very focussed on client-side
* functionality (including log output related to --dry-run, pg_rewind etc.)
* which we don't want here.
*/
static bool
check_node_can_follow(PGconn *local_conn, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_info)
{
PGconn *local_repl_conn = NULL;
t_system_identification local_identification = T_SYSTEM_IDENTIFICATION_INITIALIZER;
PGconn *follow_target_repl_conn = NULL;
t_system_identification follow_target_identification = T_SYSTEM_IDENTIFICATION_INITIALIZER;
TimeLineHistoryEntry *follow_target_history = NULL;
bool can_follow = true;
bool success;
local_repl_conn = establish_replication_connection_from_conn(local_conn, local_node_info.repluser);
if (PQstatus(local_repl_conn) != CONNECTION_OK)
{
log_error(_("unable to establish a replication connection to the local node"));
PQfinish(local_repl_conn);
return false;
}
success = identify_system(local_repl_conn, &local_identification);
PQfinish(local_repl_conn);
if (success == false)
{
log_error(_("unable to query the local node's system identification"));
return false;
}
/* check replication connection */
follow_target_repl_conn = establish_replication_connection_from_conn(follow_target_conn,
follow_target_node_info->repluser);
if (PQstatus(follow_target_repl_conn) != CONNECTION_OK)
{
log_error(_("unable to establish a replication connection to the follow target node"));
PQfinish(follow_target_repl_conn);
return false;
}
/* check system_identifiers match */
if (identify_system(follow_target_repl_conn, &follow_target_identification) == false)
{
log_error(_("unable to query the follow target node's system identification"));
PQfinish(follow_target_repl_conn);
return false;
}
/*
* Check for thing that should never happen, but expect the unexpected anyway.
*/
if (follow_target_identification.system_identifier != local_identification.system_identifier)
{
log_error(_("this node is not part of the follow target node's replication cluster"));
log_detail(_("this node's system identifier is %lu, follow target node's system identifier is %lu"),
local_identification.system_identifier,
follow_target_identification.system_identifier);
PQfinish(follow_target_repl_conn);
return false;
}
/* check timelines */
log_verbose(LOG_DEBUG, "local timeline: %i; follow target timeline: %i",
local_identification.timeline,
follow_target_identification.timeline);
/* upstream's timeline is lower than ours - impossible case */
if (follow_target_identification.timeline < local_identification.timeline)
{
log_error(_("this node's timeline is ahead of the follow target node's timeline"));
log_detail(_("this node's timeline is %i, follow target node's timeline is %i"),
local_identification.timeline,
follow_target_identification.timeline);
PQfinish(follow_target_repl_conn);
return false;
}
/* timeline is the same - check relative positions */
if (follow_target_identification.timeline == local_identification.timeline)
{
XLogRecPtr follow_target_xlogpos = get_node_current_lsn(follow_target_conn);
if (local_xlogpos == InvalidXLogRecPtr || follow_target_xlogpos == InvalidXLogRecPtr)
{
log_error(_("unable to compare LSN positions"));
PQfinish(follow_target_repl_conn);
return false;
}
if (local_xlogpos <= follow_target_xlogpos)
{
log_info(_("timelines are same, this server is not ahead"));
log_detail(_("local node lsn is %X/%X, follow target lsn is %X/%X"),
format_lsn(local_xlogpos),
format_lsn(follow_target_xlogpos));
}
else
{
log_error(_("this node is ahead of the follow target"));
log_detail(_("local node lsn is %X/%X, follow target lsn is %X/%X"),
format_lsn(local_xlogpos),
format_lsn(follow_target_xlogpos));
can_follow = false;
}
}
else
{
/*
* upstream has higher timeline - check where it forked off from this node's timeline
*/
follow_target_history = get_timeline_history(follow_target_repl_conn,
local_identification.timeline + 1);
if (follow_target_history == NULL)
{
/* get_timeline_history() will emit relevant error messages */
PQfinish(follow_target_repl_conn);
return false;
}
log_debug("local tli: %i; local_xlogpos: %X/%X; follow_target_history->tli: %i; follow_target_history->end: %X/%X",
(int)local_identification.timeline,
format_lsn(local_xlogpos),
follow_target_history->tli,
format_lsn(follow_target_history->end));
/*
* Local node has proceeded beyond the follow target's fork, so we
* definitely can't attach.
*
* This could be the case if the follow target was promoted, but does
* not contain all changes which are being replayed to this standby.
*/
if (local_xlogpos > follow_target_history->end)
{
log_error(_("this node cannot attach to follow target node \"%s\" (ID: %i)"),
follow_target_node_info->node_name,
follow_target_node_info->node_id);
can_follow = false;
log_detail(_("follow target server's timeline %lu forked off current database system timeline %lu before current recovery point %X/%X"),
local_identification.system_identifier + 1,
local_identification.system_identifier,
format_lsn(local_xlogpos));
}
if (can_follow == true)
{
log_info(_("local node \"%s\" (ID: %i) can attach to follow target node \"%s\" (ID: %i)"),
config_file_options.node_name,
config_file_options.node_id,
follow_target_node_info->node_name,
follow_target_node_info->node_id);
log_detail(_("local node's recovery point: %X/%X; follow target node's fork point: %X/%X"),
format_lsn(local_xlogpos),
format_lsn(follow_target_history->end));
}
}
PQfinish(follow_target_repl_conn);
if (follow_target_history)
pfree(follow_target_history);
return can_follow;
}
static void
check_witness_attached(t_node_info *node_info, bool startup)
{
/*
* connect and check upstream node id; at this point we don't care if it's
* not reachable, only whether we can mark it as attached or not.
*/
PGconn *witness_conn = establish_db_connection_quiet(node_info->conninfo);
if (PQstatus(witness_conn) == CONNECTION_OK)
{
int witness_upstream_node_id = repmgrd_get_upstream_node_id(witness_conn);
log_debug("witness node %i's upstream node ID reported as %i",
node_info->node_id,
witness_upstream_node_id);
if (witness_upstream_node_id == local_node_info.node_id)
{
node_info->attached = NODE_ATTACHED;
}
else
{
node_info->attached = NODE_DETACHED;
}
}
else
{
node_info->attached = startup == true ? NODE_ATTACHED_UNKNOWN : NODE_DETACHED;
}
PQfinish(witness_conn);
}
static t_child_node_info *
append_child_node_record(t_child_node_info_list *nodes, int node_id, const char *node_name, t_server_type type, NodeAttached attached)
{
t_child_node_info *child_node = pg_malloc0(sizeof(t_child_node_info));
child_node->node_id = node_id;
snprintf(child_node->node_name, sizeof(child_node->node_name), "%s", node_name);
child_node->type = type;
child_node->attached = attached;
if (nodes->tail)
nodes->tail->next = child_node;
else
nodes->head = child_node;
nodes->tail = child_node;
nodes->node_count++;
return child_node;
}
static void
remove_child_node_record(t_child_node_info_list *nodes, int node_id)
{
t_child_node_info *node;
t_child_node_info *prev_node = NULL;
t_child_node_info *next_node = NULL;
node = nodes->head;
while (node != NULL)
{
next_node = node->next;
if (node->node_id == node_id)
{
/* first node */
if (node == nodes->head)
{
nodes->head = next_node;
}
/* last node */
else if (next_node == NULL)
{
prev_node->next = NULL;
}
else
{
prev_node->next = next_node;
}
pfree(node);
nodes->node_count--;
return;
}
else
{
prev_node = node;
}
node = next_node;
}
}
static void
clear_child_node_info_list(t_child_node_info_list *nodes)
{
t_child_node_info *node;
t_child_node_info *next_node;
node = nodes->head;
while (node != NULL)
{
next_node = node->next;
pfree(node);
node = next_node;
}
nodes->head = NULL;
nodes->tail = NULL;
nodes->node_count = 0;
}
static void
parse_child_nodes_disconnect_command(char *parsed_command, char *template, int reporting_node_id)
{
const char *src_ptr = NULL;
char *dst_ptr = NULL;
char *end_ptr = NULL;
dst_ptr = parsed_command;
end_ptr = (parsed_command + MAXPGPATH) - 1;
*end_ptr = '\0';
for (src_ptr = template; *src_ptr; src_ptr++)
{
if (*src_ptr == '%')
{
switch (src_ptr[1])
{
case '%':
/* %%: replace with % */
if (dst_ptr < end_ptr)
{
src_ptr++;
*dst_ptr++ = *src_ptr;
}
break;
case 'p':
/* %p: node id of the reporting primary */
src_ptr++;
snprintf(dst_ptr, end_ptr - dst_ptr, "%i", reporting_node_id);
dst_ptr += strlen(dst_ptr);
break;
}
}
else
{
if (dst_ptr < end_ptr)
*dst_ptr++ = *src_ptr;
}
}
*dst_ptr = '\0';
return;
}
int
try_primary_reconnect(PGconn **conn, PGconn *local_conn, t_node_info *node_info)
{
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
int i;
int max_attempts = config_file_options.reconnect_attempts;
initialize_conninfo_params(&conninfo_params, false);
/* we assume by now the conninfo string is parseable */
(void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false);
/* set some default values if not explicitly provided */
param_set_ine(&conninfo_params, "connect_timeout", "2");
param_set_ine(&conninfo_params, "fallback_application_name", "repmgr");
for (i = 0; i < max_attempts; i++)
{
time_t started_at = time(NULL);
int up_to;
bool sleep_now = false;
int max_sleep_seconds;
log_info(_("checking state of node \"%s\" (ID: %i), %i of %i attempts"),
node_info->node_name,
node_info->node_id,
i + 1, max_attempts);
if (is_server_available_params(&conninfo_params) == true)
{
PGconn *our_conn;
log_notice(_("node \"%s\" (ID: %i) has recovered, reconnecting"),
node_info->node_name,
node_info->node_id);
/*
* Note: we could also handle the case where node is pingable but
* connection denied due to connection exhaustion, by falling back to
* degraded monitoring (make configurable)
*/
our_conn = establish_db_connection_by_params(&conninfo_params, false);
if (PQstatus(our_conn) == CONNECTION_OK)
{
free_conninfo_params(&conninfo_params);
log_info(_("connection to node \"%s\" (ID: %i) succeeded"),
node_info->node_name,
node_info->node_id);
if (PQstatus(*conn) == CONNECTION_BAD)
{
log_verbose(LOG_INFO, _("original connection handle returned CONNECTION_BAD, using new connection"));
close_connection(conn);
*conn = our_conn;
}
else
{
ExecStatusType ping_result;
ping_result = connection_ping(*conn);
if (ping_result != PGRES_TUPLES_OK)
{
log_info(_("original connection no longer available, using new connection"));
close_connection(conn);
*conn = our_conn;
}
else
{
log_info(_("original connection is still available"));
PQfinish(our_conn);
}
}
node_info->node_status = NODE_STATUS_UP;
return UNKNOWN_NODE_ID;
}
close_connection(&our_conn);
log_notice(_("unable to reconnect to node \"%s\" (ID: %i)"),
node_info->node_name,
node_info->node_id);
}
/*
* Experimental behaviour, see GitHub #662.
*/
if (config_file_options.reconnect_loop_sync == true)
{
up_to = (time(NULL) - started_at);
max_sleep_seconds = (up_to == 0)
? config_file_options.reconnect_interval
: (up_to % config_file_options.reconnect_interval);
if (i + 1 <= max_attempts)
sleep_now = true;
}
else
{
max_sleep_seconds = config_file_options.reconnect_interval;
if (i + 1 < max_attempts)
sleep_now = true;
}
if (sleep_now == true)
{
int j;
log_info(_("sleeping up to %i seconds until next reconnection attempt"),
max_sleep_seconds);
for (j = 0; j < max_sleep_seconds; j++)
{
int new_primary_node_id;
if (get_new_primary(local_conn, &new_primary_node_id) == true && new_primary_node_id != UNKNOWN_NODE_ID)
{
if (new_primary_node_id == ELECTION_RERUN_NOTIFICATION)
{
log_notice(_("received rerun notification"));
}
else
{
log_notice(_("received notification that new primary is node %i"), new_primary_node_id);
}
free_conninfo_params(&conninfo_params);
return new_primary_node_id;
}
sleep(1);
}
}
}
log_warning(_("unable to reconnect to node \"%s\" (ID: %i) after %i attempts"),
node_info->node_name,
node_info->node_id,
max_attempts);
node_info->node_status = NODE_STATUS_DOWN;
free_conninfo_params(&conninfo_params);
return UNKNOWN_NODE_ID;
}
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/mirrors/repmgr.git
git@gitee.com:mirrors/repmgr.git
mirrors
repmgr
repmgr
master

搜索帮助