[SaltStack] salt-minion启动流程
2015-07-10 15:19
441 查看
SaltStack源码阅读
前面理了下salt-master的启动流程, 这次来看看salt-minion的启动流程.启动salt-minion方法:
/etc/init.d/salt-minion start
看看/etc/init.d/salt-minion逻辑:
$ cat /etc/init.d/salt-minion
# Excerpt of the salt-minion init script: the variables used by start() and
# the start() function itself. SUSE_RELEASE, DEBIAN_VERSION, LOCKFILE,
# SERVICE and PROCESS are referenced below but defined outside this excerpt.
SALTMINION=/usr/bin/salt-minion
PYTHON=/usr/bin/python
MINION_ARGS=""

# Start the salt-minion daemon using the mechanism of the detected distro.
# Returns the value of RETVAL.
start() {
    echo -n $"Starting salt-minion daemon: "
    if [ -f $SUSE_RELEASE ]; then
        # SUSE branch: launch through startproc and report with rc_status
        startproc -f -p /var/run/$SERVICE.pid $SALTMINION -d $MINION_ARGS
        rc_status -v
    elif [ -e $DEBIAN_VERSION ]; then
        # Debian branch: refuse to start when the lock file already exists
        if [ -f $LOCKFILE ]; then
            echo -n "already started, lock file found"
            RETVAL=1
        elif $PYTHON $SALTMINION -d $MINION_ARGS >& /dev/null; then
            echo -n "OK"
            RETVAL=0
        fi
    else
        # Fallback branch: use pidofproc/daemon
        if [[ ! -z "$(pidofproc -p /var/run/$SERVICE.pid $PROCESS)" ]]; then
            RETVAL=$?
            echo -n "already running"
        else
            daemon --check $SERVICE $SALTMINION -d $MINION_ARGS
        fi
    fi
    # NOTE(review): this assignment overwrites any RETVAL set inside the
    # branches above with the exit status of the preceding if-block --
    # confirm that this is intended.
    RETVAL=$?
    echo
    return $RETVAL
}
继续看看/usr/bin/salt-minion:
$ cat /usr/bin/salt-master #!/usr/bin/python ''' This script is used to kick off a salt minion daemon ''' from salt.scripts import salt_minion from multiprocessing import freeze_support if __name__ == '__main__': # This handles the bootstrapping code that is included with frozen # scripts. It is a no-op on unfrozen code. freeze_support() salt_minion()
调用salt_minion()方法, 在scripts.py里:
$ cat scripts.py def salt_minion(): ''' Start the salt minion. ''' import salt.cli.daemons import multiprocessing if '' in sys.path: sys.path.remove('') if salt.utils.is_windows(): minion = salt.cli.daemons.Minion() minion.start() return if '--disable-keepalive' in sys.argv: sys.argv.remove('--disable-keepalive') minion = salt.cli.daemons.Minion() minion.start() return # keep one minion subprocess running while True: try: queue = multiprocessing.Queue() except Exception: # This breaks in containers minion = salt.cli.daemons.Minion() minion.start() return process = multiprocessing.Process(target=minion_process, args=(queue,)) process.start() try: process.join() try: restart_delay = queue.get(block=False) except Exception: if process.exitcode == 0: # Minion process ended naturally, Ctrl+C or --version break restart_delay = 60 if restart_delay == 0: # Minion process ended naturally, Ctrl+C, --version, etc. break # delay restart to reduce flooding and allow network resources to close time.sleep(restart_delay) except KeyboardInterrupt: break # need to reset logging because new minion objects # cause extra log handlers to accumulate rlogger = logging.getLogger() for handler in rlogger.handlers: rlogger.removeHandler(handler) logging.basicConfig()
这里启动minion使用了multiprocessing.Process方法, target是minion_process函数, 主要流程是:
def minion_process(queue):
    '''
    Start a minion process

    Runs ``salt.cli.daemons.Minion().start()`` and, when it returns or
    fails, puts one number on ``queue``: a random restart delay in seconds
    when the minion failed with an exception, or 0 when no restart is
    wanted.

    :param queue: queue object whose ``put`` method receives the restart
                  delay
    '''
    import salt.cli.daemons
    # salt_minion spawns this function in a new process

    def suicide_when_without_parent(parent_pid):
        '''
        Have the minion suicide if the parent process is gone

        NOTE: there is a small race issue where the parent PID could be replace
        with another process with the same PID!
        '''
        while True:
            time.sleep(5)
            try:
                # check pid alive (Unix only trick!)
                os.kill(parent_pid, 0)
            except OSError:
                # forcibly exit, regular sys.exit raises an exception-- which
                # isn't sufficient in a thread
                os._exit(999)
    if not salt.utils.is_windows():
        # Watchdog thread: polls the parent PID every 5 seconds
        thread = threading.Thread(target=suicide_when_without_parent, args=(os.getppid(),))
        thread.start()

    restart = False
    minion = None
    try:
        minion = salt.cli.daemons.Minion()
        minion.start()
    except (Exception, SaltClientError, SaltReqTimeoutError, SaltSystemExit) as exc:
        log.error('Minion failed to start: ', exc_info=True)
        restart = True
    except SystemExit as exc:
        restart = False

    if restart is True:
        log.warn('** Restarting minion **')
        delay = 60
        if minion is not None:
            if hasattr(minion, 'config'):
                delay = minion.config.get('random_reauth_delay', 60)
        # Pick a delay between 1 and the configured maximum (inclusive)
        random_delay = randint(1, delay)
        log.info('Sleeping random_reauth_delay of {0} seconds'.format(random_delay))
        # preform delay after minion resources have been cleaned
        queue.put(random_delay)
    else:
        queue.put(0)
这里调用了salt.cli.daemons.Minion的start方法, 目标类文件在: ~/salt/cli/daemons.py
class Minion(parsers.MinionOptionParser):
    '''
    Create a minion server
    '''
    def prepare(self):
        '''
        Run the preparation sequence required to start a salt minion.

        Parses the command line, optionally verifies the directories and log
        file listed in the configuration, sets up file logging, daemonizes
        if required, writes the pidfile and finally stores the minion
        object to run in ``self.minion``.

        If sub-classed, don't **ever** forget to run:

            super(YourSubClass, self).prepare()
        '''
        self.parse_args()

        try:
            if self.config['verify_env']:
                confd = self.config.get('default_include')
                if confd:
                    # If 'default_include' is specified in config, then use it
                    if '*' in confd:
                        # Value is of the form "minion.d/*.conf"
                        confd = os.path.dirname(confd)
                    if not os.path.isabs(confd):
                        # If configured 'default_include' is not an absolute
                        # path, consider it relative to folder of 'conf_file'
                        # (/etc/salt by default)
                        confd = os.path.join(
                            os.path.dirname(self.config['conf_file']), confd
                        )
                else:
                    confd = os.path.join(
                        os.path.dirname(self.config['conf_file']), 'minion.d'
                    )
                # Directories handed to verify_env()
                v_dirs = [
                        self.config['pki_dir'],
                        self.config['cachedir'],
                        self.config['sock_dir'],
                        self.config['extension_modules'],
                        confd,
                    ]
                if self.config.get('transport') == 'raet':
                    v_dirs.append(os.path.join(self.config['pki_dir'], 'accepted'))
                    v_dirs.append(os.path.join(self.config['pki_dir'], 'pending'))
                    v_dirs.append(os.path.join(self.config['pki_dir'], 'rejected'))
                    v_dirs.append(os.path.join(self.config['cachedir'], 'raet'))
                verify_env(
                    v_dirs,
                    self.config['user'],
                    permissive=self.config['permissive_pki_access'],
                    pki_dir=self.config['pki_dir'],
                )
                logfile = self.config['log_file']
                if logfile is not None and not logfile.startswith(('tcp://',
                                                                   'udp://',
                                                                   'file://')):
                    # Logfile is not using Syslog, verify
                    current_umask = os.umask(0o027)
                    verify_files([logfile], self.config['user'])
                    os.umask(current_umask)
        except OSError as err:
            logger.exception('Failed to prepare salt environment')
            sys.exit(err.errno)

        self.setup_logfile_logger()
        logger.info(
            'Setting up the Salt Minion "{0}"'.format(
                self.config['id']
            )
        )
        migrations.migrate_paths(self.config)
        # TODO: AIO core is separate from transport
        if self.config['transport'].lower() in ('zeromq', 'tcp'):
            # Late import so logging works correctly
            import salt.minion
            # If the minion key has not been accepted, then Salt enters a loop
            # waiting for it, if we daemonize later then the minion could halt
            # the boot process waiting for a key to be accepted on the master.
            # This is the latest safe place to daemonize
            self.daemonize_if_required()
            self.set_pidfile()
            # A list of masters selects MultiMinion unless master_type is
            # 'failover'; everything else uses a plain Minion
            if isinstance(self.config.get('master'), list):
                if self.config.get('master_type') == 'failover':
                    self.minion = salt.minion.Minion(self.config)
                else:
                    self.minion = salt.minion.MultiMinion(self.config)
            else:
                self.minion = salt.minion.Minion(self.config)
        else:
            # Any other transport value is served by the ioflo-based minion
            import salt.daemons.flo
            self.daemonize_if_required()
            self.set_pidfile()
            self.minion = salt.daemons.flo.IofloMinion(self.config)

    def start(self):
        '''
        Start the actual minion.

        Calls ``prepare()`` and then, if ``check_user`` accepts the
        configured user, ``self.minion.tune_in()``. ``self.shutdown()`` is
        always called afterwards.

        If sub-classed, don't **ever** forget to run:

            super(YourSubClass, self).start()

        NOTE: Run any required code before calling `super()`.
        '''
        try:
            self.prepare()
            if check_user(self.config['user']):
                logger.info('The salt minion is starting up')
                self.minion.tune_in()  # article note: sets up the publisher link to the master
        except (KeyboardInterrupt, SaltSystemExit) as exc:
            logger.warn('Stopping the Salt Minion')
            if isinstance(exc, KeyboardInterrupt):
                logger.warn('Exiting on Ctrl-c')
            else:
                logger.error(str(exc))
        finally:
            self.shutdown()
在这里minion使用prepare方法准备salt minion参数和环境配置, 通过self.minion.tune_in()尝试连接master的publisher, 和master建立通信机制, 目标文件在: ~/salt/minion.py
class MultiMinion(MinionBase): ''' 如果minion-conf里配置了多minion, 则会使用MultiMinion来启动minion Create a multi minion interface, this creates as many minions as are defined in the master option and binds each minion object to a respective master. ''' # Multi Master Tune In def tune_in(self): ''' Bind to the masters This loop will attempt to create connections to masters it hasn't connected to yet, but once the initial connection is made it is up to ZMQ to do the reconnect (don't know of an API to get the state here in salt) ''' # Fire off all the minion coroutines self.minions = self._spawn_minions() # serve forever! self.io_loop.start()
这里tune_in()是关键的步骤, minion链接master, 维护minion Pub/Sub socket通信等等.
先看看tune_in()方法:
# Multi Master Tune In def tune_in(self): ''' Bind to the masters This loop will attempt to create connections to masters it hasn't connected to yet, but once the initial connection is made it is up to ZMQ to do the reconnect (don't know of an API to get the state here in salt) ''' # Fire off all the minion coroutines self.minions = self._spawn_minions() # serve forever! self.io_loop.start() #启动io_loop异步事件驱动 def _spawn_minions(self): ''' Spawn all the coroutines which will sign in to masters ''' if not isinstance(self.opts['master'], list): log.error( 'Attempting to start a multimaster system with one master') sys.exit(salt.defaults.exitcodes.EX_GENERIC) for master in set(self.opts['master']): s_opts = copy.deepcopy(self.opts) s_opts['master'] = master s_opts['multimaster'] = True s_opts['auth_timeout'] = self.MINION_CONNECT_TIMEOUT self.io_loop.spawn_callback(self._connect_minion, s_opts) #callback调用_connect_minion def _connect_minion(self, opts): ''' Create a minion, and asynchronously connect it to a master ''' last = 0 # never have we signed in auth_wait = opts['acceptance_wait_time'] while True: try: minion = Minion(opts, self.MINION_CONNECT_TIMEOUT, False, io_loop=self.io_loop, loaded_base_name='salt.loader.{0}'.format(opts['master']), ) yield minion.connect_master() minion.tune_in(start=False) break except SaltClientError as exc: log.error('Error while bringing up minion for multi-master. Is master at {0} responding?'.format(opts['master'])) last = time.time() if auth_wait < self.max_auth_wait: auth_wait += self.auth_wait yield tornado.gen.sleep(auth_wait) # TODO: log? except Exception as e: log.critical('Unexpected error while connecting to {0}'.format(opts['master']), exc_info=True)
到这里大家会发现tune_in进入循环了... 没错! minion server进程会无限循环下去, 维护minion Pub/Sub socket, 保持和salt master的链接, 监听event事件完成Job工作等等.
接下来看看connect_master()方法:
def connect_master(self):
    '''
    Connect this minion to a master.

    Yields the result of ``eval_master`` first; the value sent back is
    unpacked into the master address and the publish channel (stored on
    ``self.pub_channel``). Then yields ``_post_master_init(master)``.
    '''
    evaluated = yield self.eval_master(self.opts, self.timeout, self.safe)
    master, self.pub_channel = evaluated
    yield self._post_master_init(master)
eval_master方法计算并返回master地址以及生成pub_channel;
_post_master_init方法是在链接完成后进行初始化;
在eval_master里, 如果master_type是func, 则会通过salt.loader.raw_mod加载指定的module, 并调用其中的函数得到master地址; 如果master_type是failover, 则会获取master address list, 进而会调用salt.transport.client.AsyncPubChannel.factory()生成pub_channel, 调用connect()方法链接master主机, 具体逻辑是下面这样的:
def eval_master(self,
                opts,
                timeout=60,
                safe=True,
                failed=False):
    '''
    Evaluate which master to use and connect a publish channel to it.

    Generator-style coroutine: on success it finishes by raising
    ``tornado.gen.Return((master, pub_channel))``.

    :param opts: minion options; ``opts['master']`` (and, in the list case,
                 ``opts['master_list']``) are updated in place
    :param timeout: passed through to ``AsyncPubChannel.factory``
    :param safe: passed through to ``AsyncPubChannel.factory``
    :param failed: True when called again after a master connection loss;
                   with ``master_type: failover`` the current master is
                   then dropped from the candidate list

    Calls ``sys.exit(EX_GENERIC)`` when ``master_type`` is invalid, when the
    'func' module cannot be evaluated, or when 'failover' is configured
    without a usable master list.
    '''
    # check if master_type was altered from its default
    if opts['master_type'] != 'str':
        # check for a valid keyword
        if opts['master_type'] == 'func':  # 'func' type
            # split module and function and try loading the module
            mod, fun = opts['master'].split('.')
            try:
                master_mod = salt.loader.raw_mod(opts, mod, fun)  # load the module
                if not master_mod:
                    raise TypeError
                # we take whatever the module returns as master address
                opts['master'] = master_mod[mod + '.' + fun]()  # returns the master address
            except TypeError:
                msg = ('Failed to evaluate master address from '
                       'module \'{0}\''.format(opts['master']))
                log.error(msg)
                sys.exit(salt.defaults.exitcodes.EX_GENERIC)
            log.info('Evaluated master from module: {0}'.format(master_mod))

        # if failover is set, master has to be of type list
        elif opts['master_type'] == 'failover':  # 'failover' type
            if isinstance(opts['master'], list):
                # failover: connect using the list of masters
                log.info('Got list of available master addresses:'
                         ' {0}'.format(opts['master']))
                if opts['master_shuffle']:
                    # If master is a list of addresses, shuffle them before
                    # trying to connect to distribute the minions over all
                    # available masters.
                    shuffle(opts['master'])
            # if opts['master'] is a str and we have never created opts['master_list']
            elif isinstance(opts['master'], str) and ('master_list' not in opts):
                # We have a string, but a list was what was intended. Convert.
                # See issue 23611 for details
                # Wrap the address in a one-element list; list(str) would
                # split the hostname into single characters.
                opts['master'] = [opts['master']]
            elif opts['__role'] == 'syndic':  # second-level master
                log.info('Syndic setting master_syndic to \'{0}\''.format(opts['master']))

            # if failed=True, the minion was previously connected
            # we're probably called from the minions main-event-loop
            # because a master connection loss was detected. remove
            # the possibly failed master from the list of masters.
            elif failed:
                log.info('Removing possibly failed master {0} from list of'
                         ' masters'.format(opts['master']))
                # create new list of master with the possibly failed one removed
                opts['master'] = [x for x in opts['master_list'] if opts['master'] != x]

            else:
                msg = ('master_type set to \'failover\' but \'master\' '
                       'is not of type list but of type '
                       '{0}'.format(type(opts['master'])))
                log.error(msg)
                sys.exit(salt.defaults.exitcodes.EX_GENERIC)
        else:
            msg = ('Invalid keyword \'{0}\' for variable '
                   '\'master_type\''.format(opts['master_type']))
            log.error(msg)
            sys.exit(salt.defaults.exitcodes.EX_GENERIC)

    # if we have a list of masters, loop through them and be
    # happy with the first one that allows us to connect
    if isinstance(opts['master'], list):
        conn = False
        # shuffle the masters and then loop through them
        local_masters = copy.copy(opts['master'])

        for master in local_masters:
            opts['master'] = master
            opts.update(resolve_dns(opts))
            super(Minion, self).__init__(opts)  # TODO: only run init once?? This will run once per attempt

            # on first run, update self.opts with the whole master list
            # to enable a minion to re-use old masters if they get fixed
            if 'master_list' not in opts:
                opts['master_list'] = local_masters

            try:
                pub_channel = salt.transport.client.AsyncPubChannel.factory(opts,
                                                                            timeout=timeout,
                                                                            safe=safe,
                                                                            io_loop=self.io_loop,
                                                                            )
                yield pub_channel.connect()
                conn = True
                break
            except SaltClientError:
                # connecting to this master failed, try the next one
                msg = ('Master {0} could not be reached, trying '
                       'next master (if any)'.format(opts['master']))
                log.info(msg)
                continue

        if not conn:
            self.connected = False
            msg = ('No master could be reached or all masters denied '
                   'the minions connection attempt.')
            log.error(msg)
        else:
            self.connected = True
            raise tornado.gen.Return((opts['master'], pub_channel))

    # single master sign in
    else:
        opts.update(resolve_dns(opts))  # resolve the master's ip address via dns
        pub_channel = salt.transport.client.AsyncPubChannel.factory(self.opts,
                                                                    timeout=timeout,
                                                                    safe=safe,
                                                                    io_loop=self.io_loop,
                                                                    )
        yield pub_channel.connect()
        self.tok = pub_channel.auth.gen_token('salt')
        self.connected = True
        raise tornado.gen.Return((opts['master'], pub_channel))
可以看出在上面的逻辑中, 无论是multi master还是single master, 最终都会调用salt.transport.client.AsyncPubChannel类的factory()和connect()方法来完成connect master, 目标文件在: ~/salt/transport/client.py里.
class AsyncPubChannel(AsyncChannel):
    '''
    Factory class to create subscription channels to the master's Publisher
    '''
    @classmethod
    def factory(cls, opts, **kwargs):
        '''
        Return the publish-channel object for the configured transport.

        The transport type is ``opts['transport']`` when present, else
        ``opts['pillar']['master']['transport']`` when present, else
        'zeromq'. ``opts`` and ``kwargs`` are passed to the channel class.

        Raises ``Exception`` for an unknown transport type.
        '''
        # Default to ZeroMQ for now
        ttype = 'zeromq'

        # determine the ttype
        if 'transport' in opts:
            ttype = opts['transport']
        elif 'transport' in opts.get('pillar', {}).get('master', {}):
            ttype = opts['pillar']['master']['transport']

        # switch on available ttypes
        if ttype == 'zeromq':
            import salt.transport.zeromq
            return salt.transport.zeromq.AsyncZeroMQPubChannel(opts, **kwargs)
        elif ttype == 'raet':  # TODO:
            import salt.transport.raet
            return salt.transport.raet.AsyncRAETPubChannel(opts, **kwargs)
        elif ttype == 'tcp':
            if not cls._resolver:
                # TODO: add opt to specify number of resolver threads
                AsyncChannel._init_resolver()
            import salt.transport.tcp
            return salt.transport.tcp.AsyncTCPPubChannel(opts, **kwargs)
        elif ttype == 'local':  # TODO:
            import salt.transport.local
            return salt.transport.local.AsyncLocalPubChannel(opts, **kwargs)
        else:
            # Name every transport handled above and the rejected value
            raise Exception(
                'Channels are only defined for ZeroMQ, raet, tcp and local '
                '(got \'{0}\')'.format(ttype)
            )
            # return NewKindOfChannel(opts, **kwargs)
现在基本都是基于zeromq通信的, salt.transport.zeromq.AsyncZeroMQPubChannel(opts, **kwargs)方法处理pub_channel, 目标类文件在: ~/salt/transport/zeromq.py
class AsyncZeroMQPubChannel(salt.transport.mixins.auth.AESPubClientMixin, salt.transport.client.AsyncPubChannel):
    '''
    A transport channel backed by ZeroMQ for a Salt Publisher to use to
    publish commands to connected minions
    '''
    def __init__(self,
                 opts,
                 **kwargs):
        '''
        Create the ZeroMQ SUB socket and apply its socket options.

        :param opts: minion options; the keys read here are visible below
        :param kwargs: only ``io_loop`` is read; a new
                       ``tornado.ioloop.IOLoop`` is created when absent
        '''
        self.opts = opts
        self.ttype = 'zeromq'

        if 'io_loop' in kwargs:
            self.io_loop = kwargs['io_loop']
        else:
            self.io_loop = tornado.ioloop.IOLoop()  # IOLoop for network events

        self.hexid = hashlib.sha1(self.opts['id']).hexdigest()

        self.auth = salt.crypt.AsyncAuth(self.opts, io_loop=self.io_loop)  # asynchronous encrypted authentication

        self.serial = salt.payload.Serial(self.opts)

        self.context = zmq.Context()  # ZMQ publish/subscribe model (PUB/SUB)
        self._socket = self.context.socket(zmq.SUB)  # create the SUB (client) side zmq socket

        if self.opts['zmq_filtering']:
            # TODO: constants file for "broadcast"
            self._socket.setsockopt(zmq.SUBSCRIBE, 'broadcast')
            self._socket.setsockopt(zmq.SUBSCRIBE, self.hexid)
        else:
            self._socket.setsockopt(zmq.SUBSCRIBE, '')

        # NOTE(review): the collapsed listing does not show whether this
        # second empty subscription sits inside the else branch above; if
        # it is unconditional it is applied even when zmq_filtering is
        # enabled -- confirm against the upstream file.
        self._socket.setsockopt(zmq.SUBSCRIBE, '')
        self._socket.setsockopt(zmq.IDENTITY, self.opts['id'])  # the id from the minion's own config

        # TODO: cleanup all the socket opts stuff
        if hasattr(zmq, 'TCP_KEEPALIVE'):
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE, self.opts['tcp_keepalive']
            )
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE_IDLE, self.opts['tcp_keepalive_idle']
            )
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE_CNT, self.opts['tcp_keepalive_cnt']
            )
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE_INTVL, self.opts['tcp_keepalive_intvl']
            )

        recon_delay = self.opts['recon_default']

        if self.opts['recon_randomize']:
            recon_delay = randint(self.opts['recon_default'],
                                  self.opts['recon_default'] + self.opts['recon_max']
                          )

            log.debug("Generated random reconnect delay between '{0}ms' and '{1}ms' ({2})".format(
                self.opts['recon_default'],
                self.opts['recon_default'] + self.opts['recon_max'],
                recon_delay)
            )

        log.debug("Setting zmq_reconnect_ivl to '{0}ms'".format(recon_delay))
        self._socket.setsockopt(zmq.RECONNECT_IVL, recon_delay)

        if hasattr(zmq, 'RECONNECT_IVL_MAX'):
            # NOTE(review): the message logs recon_default + recon_max while
            # the option below is set to recon_max only -- confirm which
            # value is intended.
            log.debug("Setting zmq_reconnect_ivl_max to '{0}ms'".format(
                self.opts['recon_default'] + self.opts['recon_max'])
            )

            self._socket.setsockopt(
                zmq.RECONNECT_IVL_MAX, self.opts['recon_max']
            )

        if self.opts['ipv6'] is True and hasattr(zmq, 'IPV4ONLY'):
            # IPv6 sockets work for both IPv6 and IPv4 addresses
            self._socket.setsockopt(zmq.IPV4ONLY, 0)

        if HAS_ZMQ_MONITOR and self.opts['zmq_monitor']:
            self._monitor = ZeroMQSocketMonitor(self._socket)
            self._monitor.start_io_loop(self.io_loop)

    # TODO: this is the time to see if we are connected, maybe use the req channel to guess?
    @tornado.gen.coroutine
    def connect(self):
        '''
        Authenticate if necessary, then connect the SUB socket to
        ``self.master_pub``.
        '''
        if not self.auth.authenticated:  # check whether auth has already been done
            yield self.auth.authenticate()
        self.publish_port = self.auth.creds['publish_port']
        self._socket.connect(self.master_pub)  # connect to the master
看到这里也就明白了, minion在启动后最终使用了ZMQ库的Pub/Sub模型, connect方法链接master机器.
至此salt-minion算是启动起来了 -_-
先去喝杯水, 我想静静 -:)
接下来需要狠狠脑补下zeroMQ等网络通信机制.
From reno
2015-07-10 14:59
相关文章推荐
- struts2 下载文件不能显示中文名字的问题
- [置顶] Linux网络编程入门
- windows bat 定时任务
- php $_SERVER中的SERVER_NAME 和HTTP_HOST的区别
- 理解WebKit和Chromium: 渲染主循环(main loop)和requestAnimationFrame
- XML使用SAX解析与PULL解析的区别
- hadoop_on_glusterfs
- Svn简单操作(持续中)
- Bootstrap总结(一)
- Android网络框架-Volley(二) RequestQueue源码分析以及建立一个RequestQueue
- e陪诊生意好不好
- ROS安装时packages have unmet dependencies问题
- 浮点数 ieee 754
- Fragment的setUserVisibleHint方法实现
- MongoDB YUM 安装配置
- HTML5-离线缓存-升级项目笔记一
- Oracle-BPM(一)
- kafka新的producer api使用
- 编写一个函数,把一个char组成的字符串循环右移n位
- Android的MVP