您的位置:首页 > 其它

seaweedFS之Master节点挂掉导致部分Volume节点消失

2016-02-29 17:24 1426 查看
今天在测试Seaweedfs时候发现一个恐怖的现象。我启动了6个Master和10个Volume。当其中一个Master挂掉的时候,会有N个Volume也看不到了。

问题重现步骤

配置信息

Master节点配置:

六个Master节点。

占用端口 9001-9006

启动命令:

/data/weed/bin/weed master -defaultReplication="100" -mdir="/data/weed/runtime/c1/master2" -port=9002 -peers="127.0.0.1:9001,

127.0.0.1:9002,

127.0.0.1:9003,

127.0.0.1:9004,

127.0.0.1:9005,

127.0.0.1:9006"




Volume节点配置:

10个Volume节点

占用端口:9051-9055、9061-9065

启动命令

/data/weed/bin/weed volume -dataCenter="c2" -rack="r2" -dir="/data/weed/runtime/c2/volume/2" -port=9062 -max="5" -mserver="127.0.0.1:9001"




重现步骤

启动所有节点



查看seaweedfs配置信息,10个Volume节点全部在线



停掉一个非Leader的Master节点



查看seaweedfs配置信息,c2的r2和r4消失了。



问题原因

为了查找问题的原因,特意从https://github.com/chrislusf/seaweedfs下载了源代码

1.首先查看
weed/volume.go
文件内容(128行)

// A VolumeServer is created here by calling weed_server.NewVolumeServer.
// Among the arguments is *v.master, the master address given to the volume
// process.
volumeServer := weed_server.NewVolumeServer(volumeMux, publicVolumeMux,
*v.ip, *v.port, *v.publicUrl,
v.folders, v.folderMaxLimits,
volumeNeedleMapKind,
*v.master, *v.pulseSeconds, *v.dataCenter, *v.rack,
v.whiteList,
*v.fixJpgOrientation, *v.readRedirect,
)


2.在
weed/weed_server/volume_server.go
中找到
NewVolumeServer
方法(28-102行)

// The heartbeat loop is started in its own goroutine (note the `go`
// keyword), so it runs in the background rather than blocking the caller.
go func() {
connected := true

glog.V(0).Infof("Volume server bootstraps with master %s", vs.GetMasterNode())
vs.store.SetBootstrapMaster(vs.GetMasterNode())
vs.store.SetDataCenter(vs.dataCenter)
vs.store.SetRack(vs.rack)
for {
glog.V(4).Infof("Volume server sending to master %s", vs.GetMasterNode())
// Send one heartbeat to the master.
master, secretKey, err := vs.store.SendHeartbeatToMaster()
if err == nil { // Heartbeat succeeded: if we were marked disconnected, record the master and secret key that were returned.
if !connected {
connected = true
vs.SetMasterNode(master)
vs.guard.SecretKey = secretKey
glog.V(0).Infoln("Volume Server Connected with master at", master)
}
} else { // Heartbeat failed: only the local connected flag is cleared; this loop does not change vs.masterNode.
glog.V(1).Infof("Volume Server Failed to talk with master %s: %v", vs.masterNode, err)
if connected {
connected = false
}
}
// Wait before the next heartbeat: pulseSeconds scaled by (1 + rand.Float32())
// while connected, or a quarter of pulseSeconds while disconnected so that
// retries happen sooner.
if connected {
time.Sleep(time.Duration(float32(vs.pulseSeconds*1e3)*(1+rand.Float32())) * time.Millisecond)
} else {
time.Sleep(time.Duration(float32(vs.pulseSeconds*1e3)*0.25) * time.Millisecond)
}
}
}()


3.在
storage/store.go
中找到
SendHeartbeatToMaster
方法(261-339行)

//查找Master节点
masterNode, e = s.masterNodes.findMaster()
if e != nil {
return
}
var volumeMessages []*operation.VolumeInformationMessage
maxVolumeCount := 0
var maxFileKey uint64
for _, location := range s.Locations {
maxVolumeCount = maxVolumeCount + location.MaxVolumeCount
for k, v := range location.volumes {
if maxFileKey < v.nm.MaxFileKey() {
maxFileKey = v.nm.MaxFileKey()
}
if !v.expired(s.volumeSizeLimit) {
volumeMessage := &operation.VolumeInformationMessage{
Id:               proto.Uint32(uint32(k)),
Size:             proto.Uint64(uint64(v.Size())),
Collection:       proto.String(v.Collection),
FileCount:        proto.Uint64(uint64(v.nm.FileCount())),
DeleteCount:      proto.Uint64(uint64(v.nm.DeletedCount())),
DeletedByteCount: proto.Uint64(v.nm.DeletedSize()),
ReadOnly:         proto.Bool(v.readOnly),
ReplicaPlacement: proto.Uint32(uint32(v.ReplicaPlacement.Byte())),
Version:          proto.Uint32(uint32(v.Version())),
Ttl:              proto.Uint32(v.Ttl.ToUint32()),
}
volumeMessages = append(volumeMessages, volumeMessage)
} else {
if v.exiredLongEnough(MAX_TTL_VOLUME_REMOVAL_DELAY) {
s.DeleteVolume(location.volumes, v)
glog.V(0).Infoln("volume", v.Id, "is deleted.")
} else {
glog.V(0).Infoln("volume", v.Id, "is expired.")
}
}
}
}
joinMessage := &operation.JoinMessage{
IsInit:         proto.Bool(!s.connected),
Ip:             proto.String(s.Ip),
Port:           proto.Uint32(uint32(s.Port)),
PublicUrl:      proto.String(s.PublicUrl),
MaxVolumeCount: proto.Uint32(uint32(maxVolumeCount)),
MaxFileKey:     proto.Uint64(maxFileKey),
DataCenter:     proto.String(s.dataCenter),
Rack:           proto.String(s.rack),
Volumes:        volumeMessages,
}

data, err := proto.Marshal(joinMessage)
if err != nil {
return "", "", err
}
//加入Master节点
joinUrl := "http://" + masterNode + "/dir/join"
glog.V(4).Infof("Connecting to %s ...", joinUrl)

jsonBlob, err := util.PostBytes(joinUrl, data)
if err != nil {
s.masterNodes.reset()
return "", "", err
}
var ret operation.JoinResult
if err := json.Unmarshal(jsonBlob, &ret); err != nil {
glog.V(0).Infof("Failed to join %s with response: %s", joinUrl, string(jsonBlob))
s.masterNodes.reset()
return masterNode, "", err
}
if ret.Error != "" {
s.masterNodes.reset()
return masterNode, "", errors.New(ret.Error)
}
s.volumeSizeLimit = ret.VolumeSizeLimit
secretKey = security.Secret(ret.SecretKey)
s.connected = true
return


4.在
storage/store.go
中找到
findMaster
方法(52-76行)

// findMaster returns the address of the master node this store should use.
//
// If a master has already been selected (mn.lastNode >= 0) it is returned
// as-is. Otherwise each known node is asked in turn for the master list via
// operation.ListMasters. The first node that answers with a non-empty list
// causes mn.nodes to be replaced by that list plus the answering node, and
// one entry of the new list is picked at random as the current master.
//
// An error is returned when mn.nodes is empty, or when no node produced a
// usable master list.
func (mn *MasterNodes) findMaster() (string, error) {
	if len(mn.nodes) == 0 {
		return "", errors.New("No master node found!")
	}

	// A master was chosen earlier and has not been reset: keep using it.
	if mn.lastNode >= 0 {
		return mn.nodes[mn.lastNode], nil
	}

	for _, candidate := range mn.nodes {
		glog.V(4).Infof("Listing masters on %s", candidate)

		masters, err := operation.ListMasters(candidate)
		if err != nil {
			glog.V(4).Infof("Failed listing masters on %s: %v", candidate, err)
			continue
		}
		if len(masters) == 0 {
			continue
		}

		// Pick a random entry, i.e. the volume registers with a randomly
		// chosen master from the refreshed list.
		mn.nodes = append(masters, candidate)
		mn.lastNode = rand.Intn(len(mn.nodes))
		glog.V(2).Infof("current master nodes is %v", mn)
		return mn.nodes[mn.lastNode], nil
	}

	return "", errors.New("No master node available!")
}


总结

Volume节点会随机注册到一个Master节点。在
weed/weed_server/volume_server.go
调用
SendHeartbeatToMaster
出现异常之后,只是将Volume的状态设置成未连接了,并没有将所注册到的Master标记失效/注销。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: