标签:指定 发送 节点 opp event signed setting paxos 介绍
众所周知,OSD通过心跳检测机制确保彼此在线,即OSD Peer之间形成了彼此监控的网络:每6秒(osd_heartbeat_interval)向Peer发送一次心跳信息,如果超过20秒(osd_heartbeat_grace)没有收到某个Peer OSD的心跳信息,则调用send_failure,上报该OSD已经fail。本章节主题:那么mon是靠什么来监控彼此存活的呢?
答案是lease机制。
以下从两个角度介绍mon的lease机制。
每次延长租约多长时间: mon_lease 5s
mon_lease超时因子: mon_lease_ack_timeout_factor 2
最长多久发送一次lease消息因子: mon_lease_renew_interval_factor 0.6
mon选举超时时间: mon_election_timeout 5s
1.超时重新选举的timeout时间是多久? //mon_lease_ack_timeout_factor * mon_lease: 2*5=10s
2.最长多久发送一次lease消息? //mon_lease_renew_interval_factor * mon_lease:0.6*5=3s
以下从二个方面介绍,分别是mon(leader)和mon(peon)角度。
lease这个功能的切入点是extend_lease函数。
//运行在mon的leader节点
void Paxos::extend_lease()
{
assert(mon->is_leader());//leader
//assert(is_active());
lease_expire = ceph_clock_now();
lease_expire += g_conf->mon_lease;//5s
acked_lease.clear();
acked_lease.insert(mon->rank);
dout(7) << "extend_lease now+" << g_conf->mon_lease
<< " (" << lease_expire << ")" << dendl;
// bcast
/*向所有的peon发送OP_LEASE消息,消息体中带上lease_expire */
for (set<int>::const_iterator p = mon->get_quorum().begin();
p != mon->get_quorum().end(); ++p) {
if (*p == mon->rank) continue;
MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE,
ceph_clock_now());
lease->last_committed = last_committed;
lease->lease_timestamp = lease_expire;
lease->first_committed = first_committed;
mon->messenger->send_message(lease, mon->monmap->get_inst(*p));
}
// set timeout event.
// if old timeout is still in place, leave it.
if (!lease_ack_timeout_event) {//注册lease_ack_timeout_event 事件,超时时间为10s
lease_ack_timeout_event = mon->timer.add_event_after(
g_conf->mon_lease_ack_timeout_factor * g_conf->mon_lease,//2*5=10s
new C_MonContext(mon, [this](int r) {
if (r == -ECANCELED)
return;
lease_ack_timeout();// 超时回调函数
}));
}
// set renew event
utime_t at = lease_expire;
at -= g_conf->mon_lease;
at += g_conf->mon_lease_renew_interval_factor * g_conf->mon_lease;//0.6*5=3s
lease_renew_event = mon->timer.add_event_at(//注册lease_renew_event 事件,超时时间为3s
at, new C_MonContext(mon, [this](int r) {
if (r == -ECANCELED)
return;
lease_renew_timeout();// 超时回调函数
}));
}
// Leader side: timer callback for the lease-ack timeout event armed in
// extend_lease(). Bumps the l_paxos_lease_ack_timeout counter and restarts
// the monitor via bootstrap().
void Paxos::lease_ack_timeout()
{
  dout(1) << "lease_ack_timeout -- calling new election" << dendl;

  // This callback is only expected on an active leader.
  assert(mon->is_leader());
  assert(is_active());

  // The event has fired, so forget the handle before bootstrapping.
  lease_ack_timeout_event = 0;
  logger->inc(l_paxos_lease_ack_timeout);

  // Kick off probing, which leads to a new election.
  mon->bootstrap();
}
// Timer callback for the renew event armed in extend_lease(): clear the
// event handle and run the next lease round. Because extend_lease() re-arms
// this event, OP_LEASE is re-sent on each renew interval (3s with the
// values quoted in this article).
void Paxos::lease_renew_timeout()
{
  lease_renew_event = 0;
  extend_lease();
}
//mon的启动函数,开始发起选举
void Monitor::bootstrap()
{
// reset
state = STATE_PROBING;// 设置状态为STATE_PROBING
// probe monitors
dout(10) << "probing other monitors" << dendl;
for (unsigned i = 0; i < monmap->size(); i++) {//向其他mon发送OP_PROBE信号,之后会调用选举函数,这里不细说。
if ((int)i != rank)
messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined),
monmap->get_inst(i));
}
}
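总结:这里每隔1s(paxos_propose_interval)左右,由PaxosService::dispatch触发调用Paxos::extend_lease,向quorum中除自己以外的所有mon发送OP_LEASE消息;如果lease_renew_event定时器到期(lease_renew_timeout),也会再次调用extend_lease,即最长每3s(mon_lease_renew_interval_factor * mon_lease)发送一次OP_LEASE。如果超过10s(mon_lease_ack_timeout_factor * mon_lease)主mon仍未收齐其他所有mon的OP_LEASE_ACK,则lease_ack_timeout超时触发,开始进行mon选举。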
// Handle an OP_LEASE_ACK message. Records which quorum members have
// acknowledged the current lease and cancels the ack-timeout event once
// the ack set equals the quorum.
void Paxos::handle_lease_ack(MonOpRequestRef op)
{
  op->mark_paxos_event("handle_lease_ack");
  auto *ack = static_cast<MMonPaxos*>(op->get_req());
  const int from = ack->get_source().num();

  if (!lease_ack_timeout_event) {
    // No ack timeout is pending, so nothing is waiting on this ack.
    dout(10) << "handle_lease_ack from " << ack->get_source()
             << " -- stray (probably since revoked)" << dendl;
  } else if (acked_lease.count(from) != 0) {
    // This sender already acked the current lease; ignore the duplicate.
    dout(10) << "handle_lease_ack from " << ack->get_source()
             << " dup (lagging!), ignoring" << dendl;
  } else {
    // First ack from this sender for the current lease.
    acked_lease.insert(from);

    // If the ack carries a feature map, decode it into the per-sender slot.
    if (ack->feature_map.length()) {
      auto p = ack->feature_map.begin();
      ::decode(mon->quorum_feature_map[from], p);
    }

    if (acked_lease == mon->get_quorum()) {
      // Everyone in the quorum has acked in time: cancel the ack timeout.
      // yay!
      dout(10) << "handle_lease_ack from " << ack->get_source()
               << " -- got everyone" << dendl;
      mon->timer.cancel_event(lease_ack_timeout_event);
      lease_ack_timeout_event = 0;
    } else {
      // Some quorum members have not acked yet.
      dout(10) << "handle_lease_ack from " << ack->get_source()
               << " -- still need "
               << mon->get_quorum().size() - acked_lease.size()
               << " more" << dendl;
    }
  }

  // Runs for every ack, whichever branch was taken above.
  warn_on_future_time(ack->sent_timestamp, ack->get_source());
}
对于peon节点而言,收到OP_LEASE消息,是讨论的起点:
// Peon side: handle an OP_LEASE message. Extends the local lease to the
// timestamp carried in the message, replies with OP_LEASE_ACK and re-arms
// the lease timeout.
void Paxos::handle_lease(MonOpRequestRef op)
{
  op->mark_paxos_event("handle_lease");
  auto *lease = static_cast<MMonPaxos*>(op->get_req());

  // Sanity: we must be a peon and agree with the sender on last_committed;
  // otherwise the lease is dropped.
  const bool acceptable =
    mon->is_peon() && last_committed == lease->last_committed;
  if (!acceptable) {
    dout(10) << "handle_lease i‘m not a peon, or they‘re not the leader,"
             << " or the last_committed doesn‘t match, dropping" << dendl;
    op->mark_paxos_event("invalid lease, ignore");
    return;
  }

  warn_on_future_time(lease->sent_timestamp, lease->get_source());

  // Extend the lease to the time carried in the message; it is only ever
  // moved forward.
  if (lease_expire < lease->lease_timestamp) {
    lease_expire = lease->lease_timestamp;

    utime_t now = ceph_clock_now();
    if (lease_expire < now) {
      // The new expiry is already in the past: laggy mons or clock skew.
      utime_t behind = now - lease_expire;
      derr << "lease_expire from " << lease->get_source_inst()
           << " is " << behind
           << " seconds in the past; mons are probably laggy (or possibly clocks are too skewed)"
           << dendl;
    }
  }

  state = STATE_ACTIVE;

  dout(10) << "handle_lease on " << lease->last_committed
           << " now " << lease_expire << dendl;

  // Reply with OP_LEASE_ACK on the connection the lease arrived on.
  MMonPaxos *reply = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK,
                                   ceph_clock_now());
  reply->last_committed = last_committed;
  reply->first_committed = first_committed;
  reply->lease_timestamp = ceph_clock_now();
  ::encode(mon->session_map.feature_map, reply->feature_map);
  lease->get_connection()->send_message(reply);

  // (Re)set the lease timeout. If that timer fires, no OP_LEASE was handled
  // here for mon_lease_ack_timeout_factor * mon_lease, and lease_timeout()
  // calls a new election.
  reset_lease_timeout();
}
// Continuation of handle_lease(): cancel any pending lease-timeout event and
// arm a new one that calls lease_timeout() after
// mon_lease_ack_timeout_factor * mon_lease (2 * 5s = 10s with the values
// quoted in this article).
void Paxos::reset_lease_timeout()
{
  dout(20) << "reset_lease_timeout - setting timeout event" << dendl;

  // Drop the previous timeout, if any, so that only one is outstanding.
  if (lease_timeout_event)
    mon->timer.cancel_event(lease_timeout_event);

  const auto timeout =
    g_conf->mon_lease_ack_timeout_factor * g_conf->mon_lease;
  lease_timeout_event = mon->timer.add_event_after(
    timeout,
    new C_MonContext(mon, [this](int r) {
      // A cancelled event must not trigger the timeout handler.
      if (r != -ECANCELED)
        lease_timeout();
    }));
}
// Peon side: timer callback for the lease-timeout event armed in
// reset_lease_timeout(). Bumps the l_paxos_lease_timeout counter and
// restarts the monitor via bootstrap().
void Paxos::lease_timeout()
{
  dout(1) << "lease_timeout -- calling new election" << dendl;

  // This callback is only expected on a peon.
  assert(mon->is_peon());

  // The event has fired, so forget the handle before bootstrapping.
  lease_timeout_event = 0;
  logger->inc(l_paxos_lease_timeout);

  // Kick off probing, which leads to a new election.
  mon->bootstrap();
}
mon的leader和peon是互相监督的。peon对mon leader的监督体现在reset_lease_timeout函数:它会以收到OP_LEASE消息的时间为起点,注册一个超时时间为mon_lease_ack_timeout_factor * mon_lease(2*5s=10s)的定时事件。如果该定时器超时了,表示在过去的这段时间内没有收到任何OP_LEASE消息,基本可以确定mon leader出问题了,于是peon发起选举。
标签:指定 发送 节点 opp event signed setting paxos 介绍
原文地址:https://blog.51cto.com/wendashuai/2517286