docker study --- runc

This blog post is based on the master branch of runc. I have forked it and saved it at https://github.com/savagecm/runc.

Background: there are four processes involved: the docker daemon, docker-containerd, docker-containerd-shim and docker-runc.

This part focuses on docker-runc.

source code:

The entry point is in main.go:

// Excerpt from runc's main(): registers the sub-commands, installs the
// app.Before hook and runs the cli application with os.Args.
// NOTE(review): closing braces and several statements are elided in this excerpt.
app.Commands = []cli.Command{
// Hook stored in app.Before; it reads the global "debug", "log" and
// "log-format" flags.
app.Before = func(context *cli.Context) error {
if context.GlobalBool("debug") {
if path := context.GlobalString("log"); path != "" {
// Open the log file write-only in append mode (creating it if needed) with O_SYNC.
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0666)
if err != nil {
return err
switch context.GlobalString("log-format") {
case "text":
// retain logrus's default.
case "json":
return fmt.Errorf("unknown log-format %q", context.GlobalString("log-format"))
return nil
// If the command returns an error, cli takes upon itself to print
// the error on cli.ErrWriter and exit.
// Use our own writer here to ensure the log gets sent to the right location.
cli.ErrWriter = &FatalWriter{cli.ErrWriter}
if err := app.Run(os.Args); err != nil {

The supported commands are defined here. The command-line argument is matched against this list; for example, if the command passed in is "create", it is dispatched to createCommand.

createCommand contains the related definition:

package main

import (


// createCommand is the cli.Command for "runc create". Its Action validates the
// argument count, revises the pid file, loads the spec and calls
// startContainer with create=true.
// NOTE(review): the flag struct literals, closing braces and the use of
// `status` are elided in this excerpt.
var createCommand = cli.Command{
Name:  "create",
Usage: "create a container",
ArgsUsage: `<container-id>
Where "<container-id>" is your name for the instance of the container that you
are starting. The name you provide for the container instance must be unique on
your host.`,
Description: `The create command creates an instance of a container for a bundle. The bundle
is a directory with a specification file named "` + specConfig + `" and a root
The specification file includes an args parameter. The args parameter is used
to specify command(s) that get run when the container is started. To change the
command(s) that get executed on start, edit the args parameter of the spec. See
"runc spec --help" for more explanation.`,
// Command-line flags accepted by "create".
Flags: []cli.Flag{
Name:  "bundle, b",
Value: "",
Usage: `path to the root of the bundle directory, defaults to the current directory`,
Name:  "console-socket",
Value: "",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
Name:  "pid-file",
Value: "",
Usage: "specify the file to write the process id to",
Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
Name:  "no-new-keyring",
Usage: "do not create a new session keyring for the container.  This will cause the container to inherit the calling processes session key",
// Action is the handler body; every failing step returns its error unchanged.
Action: func(context *cli.Context) error {
// Checked with exactArgs and a count of 1 (the container id).
if err := checkArgs(context, 1, exactArgs); err != nil {
return err
if err := revisePidFile(context); err != nil {
return err
spec, err := setupSpec(context)
if err != nil {
return err
// The third argument (create) is true here.
status, err := startContainer(context, spec, true)
if err != nil {
return err
// exit with the container's exit status so any external supervisor is
// notified of the exit with the correct exit status.
return nil


// Context is a type that is passed through to
// each Handler action in a cli application. Context
// can be used to retrieve context-specific Args and
// parsed command-line options.
// NOTE(review): the closing brace is elided in this excerpt.
type Context struct {
// App and Command are the only exported fields.
App           *App
Command       Command
shellComplete bool
flagSet       *flag.FlagSet
setFlags      map[string]bool
// parentContext links to another Context; presumably the enclosing
// (global) context — confirm against the cli package.
parentContext *Context


// Spec is the base configuration for the container.
// The struct tags give the JSON key of every field; fields tagged
// `omitempty` are left out of the encoded JSON when empty.
// NOTE(review): the closing brace is elided in this excerpt.
type Spec struct {
// Version of the Open Container Runtime Specification with which the bundle complies.
Version string `json:"ociVersion"`
// Platform specifies the configuration's target platform.
Platform Platform `json:"platform"`
// Process configures the container process.
Process Process `json:"process"`
// Root configures the container's root filesystem.
Root Root `json:"root"`
// Hostname configures the container's hostname.
Hostname string `json:"hostname,omitempty"`
// Mounts configures additional mounts (on top of Root).
Mounts []Mount `json:"mounts,omitempty"`
// Hooks configures callbacks for container lifecycle events.
Hooks *Hooks `json:"hooks,omitempty"`
// Annotations contains arbitrary metadata for the container.
Annotations map[string]string `json:"annotations,omitempty"`

// Linux is platform specific configuration for Linux based containers.
Linux *Linux `json:"linux,omitempty" platform:"linux"`
// Solaris is platform specific configuration for Solaris containers.
Solaris *Solaris `json:"solaris,omitempty" platform:"solaris"`
// Windows is platform specific configuration for Windows based containers, including Hyper-V containers.
Windows *Windows `json:"windows,omitempty" platform:"windows"`

Take the create command as an example. It mainly calls the function startContainer():

// startContainer creates the container named by the first CLI argument and
// runs spec.Process in it through a runner. The create argument is stored
// in the runner unchanged.
// It returns -1 and an error when the id is empty or createContainer fails;
// otherwise it returns whatever r.run returns.
// NOTE(review): closing braces are elided in this excerpt.
func startContainer(context *cli.Context, spec *specs.Spec, create bool) (int, error) {
// The container id is the first positional argument.
id := context.Args().First()
if id == "" {
return -1, errEmptyID
container, err := createContainer(context, id, spec)
if err != nil {
return -1, err
// Support on-demand socket activation by passing file descriptors into the container init process.
listenFDs := []*os.File{}
if os.Getenv("LISTEN_FDS") != "" {
listenFDs = activation.Files(false)
// Bundle the container and the relevant CLI options into a runner.
r := &runner{
enableSubreaper: !context.Bool("no-subreaper"),
shouldDestroy:   true,
container:       container,
listenFDs:       listenFDs,
consoleSocket:   context.String("console-socket"),
detach:          context.Bool("detach"),
pidFile:         context.String("pid-file"),
create:          create,
return r.run(&spec.Process)

Definition of Process:

// Process contains information to start a specific application inside the container.
// Fields carrying the `platform:"linux"` tag are marked as Linux-specific.
// NOTE(review): the closing brace is elided in this excerpt.
type Process struct {
// Terminal creates an interactive terminal for the container.
Terminal bool `json:"terminal,omitempty"`
// ConsoleSize specifies the size of the console.
ConsoleSize Box `json:"consoleSize,omitempty"`
// User specifies user information for the process.
User User `json:"user"`
// Args specifies the binary and arguments for the application to execute.
Args []string `json:"args"`
// Env populates the process environment for the process.
Env []string `json:"env,omitempty"`
// Cwd is the current working directory for the process and must be
// relative to the container's root.
Cwd string `json:"cwd"`
// Capabilities are Linux capabilities that are kept for the container.
Capabilities []string `json:"capabilities,omitempty" platform:"linux"`
// Rlimits specifies rlimit options to apply to the process.
Rlimits []LinuxRlimit `json:"rlimits,omitempty" platform:"linux"`
// NoNewPrivileges controls whether additional privileges could be gained by processes in the container.
NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"`
// ApparmorProfile specifies the apparmor profile for the container.
ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"`
// SelinuxLabel specifies the selinux context that the container process is run as.
SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`

startContainer() mainly creates a container object, initializes a runner object, and calls the run function of the runner.

1. create container object

Here is createContainer():

// createContainer converts the spec into a libcontainer config, loads the
// factory and asks it to create the container with the given id.
// Errors from the conversion or from loadFactory are returned unchanged.
// NOTE(review): closing braces are elided in this excerpt.
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
// The container id doubles as the cgroup name; the remaining options come
// from CLI flags.
config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
CgroupName:       id,
UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
NoPivotRoot:      context.Bool("no-pivot"),
NoNewKeyring:     context.Bool("no-new-keyring"),
Spec:             spec,
if err != nil {
return nil, err

factory, err := loadFactory(context)
if err != nil {
return nil, err
return factory.Create(id, config)


1. load LinuxFactory.


// loadFactory returns the configured factory instance for execing containers.
// It resolves the global "root" flag to an absolute path, selects the cgroup
// manager option and passes the "criu" flag through libcontainer.CriuPath.
// NOTE(review): closing braces are elided in this excerpt.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
root := context.GlobalString("root")
abs, err := filepath.Abs(root)
if err != nil {
return nil, err
// Cgroupfs is the default; SystemdCgroups is chosen only when the
// "systemd-cgroup" flag is set and systemd.UseSystemd() reports true.
cgroupManager := libcontainer.Cgroupfs
if context.GlobalBool("systemd-cgroup") {
if systemd.UseSystemd() {
cgroupManager = libcontainer.SystemdCgroups
} else {
return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
return libcontainer.New(abs, cgroupManager, libcontainer.CriuPath(context.GlobalString("criu")))


// New builds a LinuxFactory rooted at root and applies each option func to
// it in order, returning the first option error encountered.
// When root is non-empty the directory is created with mode 0700 first.
// NOTE(review): closing braces are elided in this excerpt.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0700); err != nil {
return nil, newGenericError(err, SystemError)
// Defaults; the option funcs below may override them.
l := &LinuxFactory{
Root:      root,
InitArgs:  []string{"/proc/self/exe", "init"},
Validator: validate.New(),
CriuPath:  "criu",
for _, opt := range options {
if err := opt(l); err != nil {
return nil, err
return l, nil
// LinuxFactory implements the default factory interface for linux based systems.
// NOTE(review): the closing brace is elided in this excerpt.
type LinuxFactory struct {
    // Root directory for the factory to store state.
    Root string

    // InitArgs are arguments for calling the init responsibilities for spawning
    // a container.
    InitArgs []string

    // CriuPath is the path to the criu binary used for checkpoint and restore of
    // containers.
    CriuPath string

    // Validator provides validation to container configurations.
    Validator validate.Validator

    // NewCgroupsManager returns an initialized cgroups manager for a single container.
    NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager

Note the InitArgs: running "/proc/self/exe init" makes the process execute its own binary again (runc itself) with the init argument.

InitArgs:  []string{"/proc/self/exe", "init"},

2. call the Create method

// Factory is the interface used to create containers.
// NOTE(review): only the Create method is quoted; any other methods and the
// closing brace are elided in this excerpt.
type Factory interface {
// Creates a new container with the given id and starts the initial process inside it.
// id must be a string containing only letters, digits and underscores and must contain
// between 1 and 1024 characters, inclusive.
// The id must not already be in use by an existing container. Containers created using
// a factory with the same path (and file system) must have distinct ids.
// Returns the new container with a running process.
// errors:
// IdInUse - id is already in use by a container
// InvalidIdFormat - id has incorrect format
// ConfigInvalid - config is invalid
// Systemerror - System error
// On error, any partially created container parts are cleaned up (the operation is atomic).
Create(id string, config *configs.Config) (Container, error)


// Create validates the id and config, creates the per-container state
// directory <Root>/<id> together with the exec fifo inside it, and returns a
// linuxContainer in the stopped state.
// It fails with ConfigInvalid for an empty Root or an invalid config, with
// IdInUse when the container directory already exists, and with SystemError
// for filesystem failures.
// NOTE(review): closing braces and some statements (e.g. whatever consumes
// oldMask) are elided in this excerpt.
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
if err := l.validateID(id); err != nil {
return nil, err
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
// Host uid/gid used below to chown the state directory and the fifo.
uid, err := config.HostUID()
if err != nil {
return nil, newGenericError(err, SystemError)
gid, err := config.HostGID()
if err != nil {
return nil, newGenericError(err, SystemError)
containerRoot := filepath.Join(l.Root, id)
// A successful Stat means the directory already exists, i.e. the id is taken;
// any error other than "not exist" is reported as a system error.
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
if err := os.MkdirAll(containerRoot, 0711); err != nil {
return nil, newGenericError(err, SystemError)
if err := os.Chown(containerRoot, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
// Create the exec fifo with the umask cleared so the 0622 mode is applied as given.
fifoName := filepath.Join(containerRoot, execFifoFilename)
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
return nil, newGenericError(err, SystemError)
if err := os.Chown(fifoName, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
 c := &linuxContainer{
id:            id,
root:          containerRoot,
config:        config,
initArgs:      l.InitArgs,
criuPath:      l.CriuPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
// A freshly created container starts in the stopped state.
c.state = &stoppedState{c: c}
return c, nil

2. call run

// run builds a libcontainer process from the spec process, sets up its IO,
// starts it through the container (Start when r.create is set, Run
// otherwise) and, unless detaching, forwards signals until it exits.
// It returns -1 with the error for every setup failure shown here, and
// 0 with a nil error when detaching.
// NOTE(review): closing braces and some statements are elided in this excerpt.
func (r *runner) run(config *specs.Process) (int, error) {
// Author's note: create the new process instance.
process, err := newProcess(*config)
if err != nil {
return -1, err
// Socket activation: pass the inherited fds on as extra files and advertise
// them through LISTEN_FDS / LISTEN_PID.
if len(r.listenFDs) > 0 {
process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)

rootuid, err := r.container.Config().HostUID()
if err != nil {
return -1, err

rootgid, err := r.container.Config().HostGID()
if err != nil {
return -1, err

// "create" always behaves as detached.
detach := r.detach || r.create

// Check command-line for sanity.
if detach && config.Terminal && r.consoleSocket == "" {
return -1, fmt.Errorf("cannot allocate tty if runc will detach without setting console socket")
// XXX: Should we change this?
if (!detach || !config.Terminal) && r.consoleSocket != "" {
return -1, fmt.Errorf("cannot use console socket if runc will not detach or allocate tty")
// Author's note: for "create" this calls the container's Start; otherwise Run is used.
startFn := r.container.Start
if !r.create {
startFn = r.container.Run
// Setting up IO is a two stage process. We need to modify process to deal
// with detaching containers, and then we get a tty after the container has
// started.
handler := newSignalHandler(r.enableSubreaper)
tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach)
if err != nil {
return -1, err
// Author's note: call the container's start function here; see the Run
// method in libcontainer/container_linux.go.
if err := startFn(process); err != nil {
return -1, err
if config.Terminal {
if err := tty.recvtty(process, r.detach || r.create); err != nil {
return -1, err
defer tty.Close()

// When detaching with a terminal, send the tty over the unix socket named
// by r.consoleSocket.
if config.Terminal && detach {
conn, err := net.Dial("unix", r.consoleSocket)
if err != nil {
return -1, err
defer conn.Close()

unixconn, ok := conn.(*net.UnixConn)
if !ok {
return -1, fmt.Errorf("casting to UnixConn failed")

socket, err := unixconn.File()
if err != nil {
return -1, err
defer socket.Close()

err = tty.sendtty(socket, r.terminalinfo())
if err != nil {
return -1, err

if err := tty.ClosePostStart(); err != nil {
return -1, err
if r.pidFile != "" {
if err := createPidFile(r.pidFile, process); err != nil {
return -1, err
// Detached: return immediately without forwarding signals.
if detach {
return 0, nil
status, err := handler.forward(process, tty)
if err != nil {
return status, err

// Start reads the container's current status and delegates to c.start,
// treating the process as the init process when the status is Stopped.
// NOTE(review): the matching c.m.Lock() call and closing braces are elided
// in this excerpt.
func (c *linuxContainer) Start(process *Process) error {
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
return c.start(process, status == Stopped)

note: for the first time, it will call the following Run()

// Run starts the process like Start does and, when the container was
// Stopped beforehand, additionally calls c.exec().
// NOTE(review): the matching c.m.Lock() call and closing braces are elided
// in this excerpt.
func (c *linuxContainer) Run(process *Process) error {
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
if err := c.start(process, status == Stopped); err != nil {
return err
// Only a container that was Stopped (i.e. the init process was just
// started) goes on to exec.
if status == Stopped {
return c.exec()
return nil


// start builds the parent process object for the given process, starts it,
// records the creation time and the new container state, and then runs the
// configured Poststart hooks.
// isInit selects the init-process path and the created (rather than
// running) state.
// NOTE(review): closing braces and some statements are elided in this excerpt.
func (c *linuxContainer) start(process *Process, isInit bool) error {
// Author's note: construct the process object.
parent, err := c.newParentProcess(process, isInit)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
// Author's note: initialise the container process.
 if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
if err := parent.terminate(); err != nil {
return newSystemErrorWithCause(err, "starting container process")
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
c.state = &runningState{
c: c,
// For the init process the state is set to created instead.
if isInit {
c.state = &createdState{
c: c,
state, err := c.updateState(parent)
if err != nil {
return err
c.initProcessStartTime = state.InitProcessStartTime

// Run the Poststart hooks, if any hooks are configured.
if c.config.Hooks !=
nil {
s := configs.HookState{
Version:    c.config.Version,
ID:         c.id,
Pid:        parent.pid(),
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
// A failing hook terminates the parent process before the error is reported.
if err := parent.terminate(); err != nil {
return newSystemErrorWithCausef(err, "running poststart hook %d", i)
return nil


// newParentProcess prepares the pipe pair, the container root directory
// handle and the command template, then returns a setns process when doInit
// is false and an init process when it is true.
// NOTE(review): closing braces are elided in this excerpt.
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
// Author's note: create the pipe between parent and child.
 parentPipe, childPipe, err := newPipe()
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
rootDir, err := os.Open(c.root)
if err != nil {
return nil, err
// Author's note: the command is /proc/self/exe (the path of runc) with "init".
 cmd, err := c.commandTemplate(p, childPipe, rootDir)
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
// Author's note: initialise the first process in the container.
 return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)


// newInitProcess marks cmd as a standard init via _LIBCONTAINER_INITTYPE,
// builds the bootstrap data from the configured namespaces and returns the
// initProcess wrapping cmd, the pipes and the root directory handle.
// NOTE(review): closing braces are elided in this excerpt.
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
// Collect only the namespaces that specify a path.
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
// sharePidns is true when the PID namespace entry has a path.
_, sharePidns := nsMaps[configs.NEWPID]
// Author's note: the clone flags are set here.
 data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
// Channel with a buffer of one *os.File; initProcess.start sends the
// master pty file on it.
p.consoleChan = make(chan *os.File, 1)
return &initProcess{
cmd:           cmd,
childPipe:     childPipe,
parentPipe:    parentPipe,
manager:       c.cgroupManager,
config:        c.newInitConfig(p),
container:     c,
process:       p,
bootstrapData: data,
sharePidns:    sharePidns,
rootDir:       rootDir,
}, nil

newInitProcess() returns an initProcess:

// initProcess is the value returned by newInitProcess; it bundles the command
// to execute with the pipes, configuration and cgroup manager used while
// starting it.
// NOTE(review): the closing brace is elided in this excerpt.
type initProcess struct {
cmd           *exec.Cmd
// parentPipe is the end used by initProcess.start for writing the bootstrap
// data and exchanging sync messages.
parentPipe    *os.File
childPipe     *os.File
config        *initConfig
manager       cgroups.Manager
container     *linuxContainer
fds           []string
process       *Process
// bootstrapData is copied into parentPipe by start().
bootstrapData io.Reader
sharePidns    bool
rootDir       *os.File
start function of initProcess:

// start launches the init command, writes the bootstrap data and the config
// to it over parentPipe, applies the cgroup configuration, and then services
// the sync messages (procConsole, procReady, procHooks) sent by the child.
// It fails when the child never reached the "run" stage or, with a new mount
// namespace, never reached "resume".
// NOTE(review): closing braces and some statements are elided in this excerpt.
func (p *initProcess) start() error {
defer p.parentPipe.Close()
// Author's note: this executes /proc/self/exe (the path to runc) with "init".
err := p.cmd.Start()
p.process.ops = p
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
// Author's note: send the bootstrap data, including the namespace
// information, to nsexec through the pipe.
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err
// Author's note: wait for the setns step to finish.
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()).  If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
// Author's note: p.pid() is the pid returned by nsexec.
fds, err := getPipeFds(p.pid())
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
defer func() {
if err != nil {
// TODO: should not be the responsibility to call here
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
// Author's note: cmd.Start was called earlier, invoking /proc/self/exe with
// the init argument. The configuration is now passed to the child process
// through the pipe; initCommand (see main.go) will consume it.
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
// Track which sync replies have been sent to the child.
var (
sentRun    bool
sentResume bool

// Handle each sync message read from the child on parentPipe.
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procConsole:
// Console handshake: request the fd, receive it, hand it to the process
// through consoleChan and acknowledge.
if err := writeSync(p.parentPipe, procConsoleReq); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'request fd'")

masterFile, err := utils.RecvFd(p.parentPipe)
if err != nil {
return newSystemErrorWithCause(err, "getting master pty from child pipe")

if p.process.consoleChan == nil {
// TODO: Don't panic here, do something more sane.
panic("consoleChan is nil")
p.process.consoleChan <- masterFile

if err := writeSync(p.parentPipe, procConsoleAck); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'ack fd'")
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score for ready process")
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for ready process")
// call prestart hooks
// Only when no new mount namespace is configured; otherwise the hooks run
// in the procHooks case below.
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version:    p.container.config.Version,
ID:         p.container.id,
Pid:        p.pid(),
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
// Sync with child.
if err := writeSync(p.parentPipe, procRun); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'run'")
sentRun = true
case procHooks:
// Run the prestart hooks, then tell the child to resume.
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version:    p.container.config.Version,
ID:         p.container.id,
Pid:        p.pid(),
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
// Sync with child.
if err := writeSync(p.parentPipe, procResume); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'resume'")
sentResume = true
return newSystemError(fmt.Errorf("invalid JSON payload from child"))

return nil

// The child must have been told to run; with a new mount namespace it must
// also have been told to resume.
if !sentRun {
return newSystemErrorWithCause(ierr, "container init")
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
// Shut down the write side of the init pipe.
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "shutting down init pipe")

// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
return ierr
return nil

A brief summary:

The command actually executed is /proc/self/exe (which resolves to the runc binary itself) init; that is, runc calls itself again and passes init as its argument. So the next step is to look into the initCommand of runc.

Note: when do we create the child process?

Because the cgo code is pulled in by main_unix.go, the nsexec function in nsexec.c is called before p.cmd.Start().
Description of nsenter:

nsexec gets the bootstrap data (namespace paths, clone flags, uid/gid mappings, console path) from its parent process, clones a child process, and returns the process ID.

When nsexec finishes running, p.cmd.Start() executes and passes the config info through the pipe.


runc will then call initCommand:

// initCommand is the cli.Command for "runc init". Its Action builds a factory
// with an empty root and calls StartInitialization on it.
// NOTE(review): closing braces are elided in this excerpt.
var initCommand = cli.Command{
Name:  "init",
Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
Action: func(context *cli.Context) error {
// The error returned by New is discarded here.
factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
// as the error is sent back to the parent there is no need to log
// or write it to stderr because the parent process will handle this
panic("libcontainer: container init failed to exec")
// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
// When root is non-empty the directory is created with mode 0700 first.
// NOTE(review): closing braces are elided in this excerpt.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0700); err != nil {
return nil, newGenericError(err, SystemError)
l := &LinuxFactory{
Root:      root,
InitArgs:  []string{"/proc/self/exe", "init"},
Validator: validate.New(),
CriuPath:  "criu",
// Author's note: the linuxContainer object is constructed in LinuxFactory.Create().

// Apply the option funcs in order; the first error aborts construction.
for _, opt := range options {
if err := opt(l); err != nil {
return nil, err
return l, nil
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
// NOTE(review): closing braces and several statements (the env-var table
// entries among them) are elided in this excerpt.
func (l *LinuxFactory) StartInitialization() (err error) {
// Each pair names an environment variable (k) and the int it is parsed into (v).
var pipefd, rootfd int
for _, pair := range []struct {
k string
v *int
} {

s := os.Getenv(pair.k)

i, err := strconv.Atoi(s)
if err != nil {
return fmt.Errorf("unable to convert %s=%s to int", pair.k, s)
*pair.v = i
// Wrap the inherited pipe fd in an *os.File and read the init type from the environment.
var (
pipe = os.NewFile(uintptr(pipefd), "pipe")
it   = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
defer pipe.Close()

// clear the current process's environment to clean any libcontainer
// specific env vars.

defer func() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
fmt.Fprintln(os.Stderr, err)
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
fmt.Fprintln(os.Stderr, err)
// Convert a panic during initialization into the named return error.
defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
// Author's note: parse the information (config) sent by the parent process
// through the pipe.
i, err := newContainerInit(it, pipe, rootfd)
if err != nil {
return err

// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
return i.Init()

// newContainerInit decodes the initConfig JSON from pipe, populates the
// process environment from config.Env and returns the initer matching t:
// linuxSetnsInit for initSetns, linuxStandardInit for initStandard.
// Any other init type yields an "unknown init type" error.
// NOTE(review): closing braces are elided in this excerpt.
func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
if err := populateProcessEnvironment(config.Env); err != nil {
return nil, err
switch t {
case initSetns:
return &linuxSetnsInit{
pipe:       pipe,
config:     config,
stateDirFD: stateDirFD,
}, nil
case initStandard:
// The standard init also records the parent pid at construction time.
return &linuxStandardInit{
pipe:       pipe,
parentPid:  syscall.Getppid(),
config:     config,
stateDirFD: stateDirFD,
}, nil
return nil, fmt.Errorf("unknown init type %q", t)


// linuxStandardInit is the initer returned by newContainerInit for the
// initStandard init type.
// NOTE(review): the closing brace is elided in this excerpt.
type linuxStandardInit struct {
// pipe is the sync pipe shared with the parent process.
pipe       *os.File
// parentPid is the parent pid captured when the value was constructed; Init
// compares it against the current parent pid.
parentPid  int
// stateDirFD is the directory fd against which Init opens the exec fifo.
stateDirFD int
config     *initConfig
// Init performs the in-container setup in order: session keyring, network and
// routes, rootfs, console, hostname, AppArmor and process label, sysctls,
// read-only and masked paths, no-new-privileges, seccomp and namespace
// finalisation. It then signals the exec fifo and execs the user process.
// Each failing step returns its error.
// NOTE(review): closing braces are elided in this excerpt.
func (l *linuxStandardInit) Init() error {
if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams()

// do not inherit the parent's session keyring
sessKeyId, err := keys.JoinSessionKeyring(ringname)
if err != nil {
return err
// make session keyring searcheable
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
// Author's note: set up the network.

if err := setupNetwork(l.config); err != nil {
return err
if err := setupRoute(l.config.Config); err != nil {
return err


// prepareRootfs() can be executed only for a new mount namespace.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
return err

// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
if l.config.CreateConsole {
if err := setupConsole(l.pipe, l.config, true); err != nil {
return err
if err := system.Setctty(); err != nil {
return err

// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err

// The hostname is set only when one is configured.
if hostname := l.config.Config.Hostname; hostname != "" {
if err := syscall.Sethostname([]byte(hostname)); err != nil {
return err
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err

// Apply the configured sysctls, then the read-only and masked paths.
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return err
for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil {
return err
for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path); err != nil {
return err
// Remember the parent death signal so it can be restored after finalizeNamespace.
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return err
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
if err := finalizeNamespace(l.config); err != nil {
return err
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return err
// compare the parent from the initial start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparented to something else so we should
// just kill ourself and not cause problems for someone else.
if syscall.Getppid() != l.parentPid {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
// check for the arg before waiting to make sure it exists and it is returned
// as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
// close the pipe to signal that we have completed our init.
// wait for the fifo to be opened on the other side before
// exec'ing the users process.
fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "openat exec fifo")
if _, err := syscall.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
// With NoNewPrivileges set, seccomp is initialised here, right before the exec.
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
// Author's note: exec the user process.
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
return nil

